From 4b827f07932c9c18df08e75a2615b01f0ab0bc74 Mon Sep 17 00:00:00 2001 From: Ahmet Melek <39141206+ahmetmeleq@users.noreply.github.com> Date: Wed, 5 Jul 2023 22:37:40 +0100 Subject: [PATCH] fix: local connector output filename when a single file is being processed (#879) * fix string processing error for _output_filename * Add docstring and type hint, update CHANGELOG, update version * update test fixture * simple code change commit to retrigger ci checks * update test fixture - after brew install tesseract-lang * Update ingest test fixtures (#882) Co-authored-by: ahmetmeleq * correct CHANGELOG * correct CHANGELOG --------- Co-authored-by: Unstructured-DevOps <111007769+Unstructured-DevOps@users.noreply.github.com> Co-authored-by: ahmetmeleq --- .pre-commit-config.yaml | 1 + CHANGELOG.md | 3 ++- .../english-and-korean.png.json | 0 unstructured/__version__.py | 2 +- unstructured/ingest/connector/local.py | 15 +++++++++++---- 5 files changed, 15 insertions(+), 6 deletions(-) rename test_unstructured_ingest/expected-structured-output/local-single-file/{example-docs => }/english-and-korean.png.json (100%) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bf1e2fe3d..88da6a35c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,6 +8,7 @@ repos: - id: check-json - id: check-xml - id: end-of-file-fixer + exclude: \.json$ include: \.py$ - id: trailing-whitespace - id: mixed-line-ending diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d569477b..2331647ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.8.0-dev0 +## 0.8.0-dev1 ### Enhancements @@ -8,6 +8,7 @@ ### Fixes * Fix KeyError when `isd_to_elements` doesn't find a type +* Fix _output_filename for local connector, allowing single files to be written correctly to the disk ### BREAKING CHANGES diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/english-and-korean.png.json b/test_unstructured_ingest/expected-structured-output/local-single-file/english-and-korean.png.json similarity index 100% rename from test_unstructured_ingest/expected-structured-output/local-single-file/example-docs/english-and-korean.png.json rename to test_unstructured_ingest/expected-structured-output/local-single-file/english-and-korean.png.json diff --git a/unstructured/__version__.py b/unstructured/__version__.py index b41f41048..5cee9d168 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.8.0-dev0" # pragma: no cover +__version__ = "0.8.0-dev1" # pragma: no cover diff --git a/unstructured/ingest/connector/local.py b/unstructured/ingest/connector/local.py index b2d5c8000..6d4901e1e 100644 --- a/unstructured/ingest/connector/local.py +++ b/unstructured/ingest/connector/local.py @@ -51,11 +51,18 @@ class LocalIngestDoc(BaseIngestDoc): pass @property - def _output_filename(self): - return ( - Path(self.standard_config.output_dir) - / f"{self.path.replace(f'{self.config.input_path}/', '')}.json" + def _output_filename(self) -> Path: + """Returns output filename for the doc + If input path argument is a file itself, it returns the filename of the doc. + If input path argument is a folder, it returns the relative path of the doc. + """ + input_path = Path(self.config.input_path) + basename = ( + f"{Path(self.path).name}.json" + if input_path.is_file() + else f"{Path(self.path).relative_to(input_path)}.json" ) + return Path(self.standard_config.output_dir) / basename class LocalConnector(BaseConnector):