mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
137 lines
3.9 KiB
Python
137 lines
3.9 KiB
Python
import os
|
|
import pathlib
|
|
|
|
import pytest
|
|
|
|
from unstructured.ingest.connector.git import GitIngestDoc, SimpleGitConfig
|
|
|
|
# Resolved directory that contains this test module.
DIRECTORY = pathlib.Path(__file__).parent.resolve()
# The "example-docs" directory one level above DIRECTORY; the tests below pass it
# as the ``download_dir`` of the git connector config.
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, os.pardir, "example-docs")
|
|
|
|
# File names (relative to EXAMPLE_DOCS_DIRECTORY) that every test below is parametrized over.
test_files = ["layout-parser-paper-fast.jpg", "layout-parser-paper-fast.pdf"]
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_metadata_include_filename(filename: str):
    """With ``metadata_include="filename"``, each element's metadata has exactly that key."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        metadata_keys = set(element["metadata"].keys())
        assert metadata_keys == {"filename"}
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_metadata_include_filename_pagenum(filename: str):
    """With ``metadata_include="filename,page_number"``, metadata holds exactly those two keys."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        metadata_keys = set(element["metadata"].keys())
        assert metadata_keys == {"filename", "page_number"}
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_metadata_exclude_filename(filename: str):
    """With ``metadata_exclude="filename"``, no element's metadata contains ``filename``."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        metadata_keys = element["metadata"].keys()
        assert "filename" not in metadata_keys
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_metadata_exclude_filename_pagenum(filename: str):
    """With ``metadata_exclude="filename,page_number"``, neither key appears in any metadata."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_exclude="filename,page_number",
    )
    doc = GitIngestDoc(path=filename, config=config)

    for element in doc.process_file():
        # Look the keys up separately for each assertion, as two independent checks.
        assert "filename" not in element["metadata"].keys()
        assert "page_number" not in element["metadata"].keys()
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_fields_include_default(filename: str):
    """Without ``fields_include``, each element has element_id, text, type and metadata."""
    config = SimpleGitConfig(download_dir=EXAMPLE_DOCS_DIRECTORY)
    doc = GitIngestDoc(path=filename, config=config)

    expected_fields = {"element_id", "text", "type", "metadata"}
    for element in doc.process_file():
        assert expected_fields == set(element.keys())
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_fields_include_elementid(filename: str):
    """With ``fields_include="element_id"``, each element carries only the ``element_id`` key."""
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        fields_include="element_id",
    )
    doc = GitIngestDoc(path=filename, config=config)

    expected_fields = {"element_id"}
    for element in doc.process_file():
        assert expected_fields == set(element.keys())
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename(filename: str):
    """With ``flatten_metadata=True`` and ``metadata_include="filename"``.

    Each element's top-level keys must be exactly element_id, text, type and filename.
    """
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)

    expected_fields = {"element_id", "text", "type", "filename"}
    for element in doc.process_file():
        assert expected_fields == set(element.keys())
|
|
|
|
|
|
@pytest.mark.parametrize("filename", test_files)
def test_process_file_flatten_metadata_filename_pagenum(filename: str):
    """With ``flatten_metadata=True`` and ``metadata_include="filename,page_number"``.

    Each element's top-level keys must be exactly element_id, text, type, filename
    and page_number.
    """
    config = SimpleGitConfig(
        download_dir=EXAMPLE_DOCS_DIRECTORY,
        metadata_include="filename,page_number",
        flatten_metadata=True,
    )
    doc = GitIngestDoc(path=filename, config=config)

    expected_fields = {"element_id", "text", "type", "filename", "page_number"}
    for element in doc.process_file():
        assert expected_fields == set(element.keys())
|