Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-12-27 07:03:52 +00:00)
fix: revert dropping of filename extension for some connectors (#3109)
The V2 refactor of the ingest code introduced the removal of original file extensions. Since the upgrade of connectors is incomplete, some connectors currently remove the original file extension and some do not. It is still TBD whether dropping the extension is something we want at all. This PR reverts specifically that change in the V2 ingest code so that the original file extension is preserved downstream.

## Testing

CI is passing with filenames updated via the `Ingest Test Fixtures Update` workflow.

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
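The change itself comes down to `pathlib.Path.stem` versus `pathlib.Path.name` when result filenames are built. A minimal sketch, using a hypothetical input filename:

```python
from pathlib import Path

source = Path("fake-memo.pdf")  # hypothetical input file

# Path.stem drops the extension; Path.name keeps it.
print(source.stem)            # fake-memo
print(source.name)            # fake-memo.pdf

# Result filenames built from each:
print(f"{source.stem}.json")  # fake-memo.json      (original extension lost)
print(f"{source.name}.json")  # fake-memo.pdf.json  (original extension preserved)
```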
This commit is contained in:
parent f4457249a7
commit 6b5d8a9785

CHANGELOG.md: 10 changed lines
CHANGELOG.md

```diff
@@ -1,3 +1,13 @@
+## 0.14.4-dev
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change.
+
 ## 0.14.3
 
 ### Enhancements
```
```diff
@@ -1 +1 @@
-__version__ = "0.14.3" # pragma: no cover
+__version__ = "0.14.4-dev" # pragma: no cover
```
```diff
@@ -43,7 +43,7 @@ class ChunkStep(PipelineStep):
         return False
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
```
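The same one-line revert appears in `EmbedStep` and `PartitionStep` below. As a rough standalone sketch (the class scaffolding and `get_hash` here are simplified stand-ins, not the real implementations), hashing over `filename.name` also keeps cache entries distinct for inputs that share a stem but differ in extension:

```python
import hashlib
from pathlib import Path


class DemoStep:
    """Simplified stand-in for the pipeline steps above; get_hash is a stub."""

    def __init__(self, cache_dir: Path):
        self.cache_dir = cache_dir

    def get_hash(self, extras: list[str]) -> str:
        # Stub: hash whatever identifying strings are passed in.
        return hashlib.sha256("".join(extras).encode()).hexdigest()[:32]

    def get_output_filepath(self, filename: Path) -> Path:
        # After the revert: the full name (with extension) feeds the hash,
        # so "memo.pdf" and "memo.html" map to different cache files.
        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
        filepath = (self.cache_dir / hashed_output_file).resolve()
        filepath.parent.mkdir(parents=True, exist_ok=True)
        return filepath


step = DemoStep(cache_dir=Path("ingest-cache"))
print(step.get_output_filepath(Path("memo.pdf")))
print(step.get_output_filepath(Path("memo.html")))  # distinct from memo.pdf
```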
```diff
@@ -43,7 +43,7 @@ class EmbedStep(PipelineStep):
         return False
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
```
```diff
@@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
         return False
 
     def get_output_filepath(self, filename: Path) -> Path:
-        hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
+        hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
         filepath = (self.cache_dir / hashed_output_file).resolve()
         filepath.parent.mkdir(parents=True, exist_ok=True)
         return filepath
```
```diff
@@ -308,7 +308,7 @@ class FsspecUploader(Uploader):
             Path(self.upload_config.path_without_protocol)
             / file_data.source_identifiers.relative_path
         )
-        updated_upload_path = upload_path.parent / f"{upload_path.stem}.json"
+        updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
         upload_path_str = str(updated_upload_path)
         path_str = str(path.resolve())
         if self.fs.exists(path=upload_path_str) and not self.upload_config.overwrite:
```
```diff
@@ -149,9 +149,7 @@ class LocalUploader(Uploader):
         for content in contents:
             identifiers = content.file_data.source_identifiers
             new_path = self.upload_config.output_path / identifiers.relative_path
-            final_path = str(new_path).replace(
-                identifiers.filename, f"{identifiers.filename_stem}.json"
-            )
+            final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
             Path(final_path).parent.mkdir(parents=True, exist_ok=True)
             logger.debug(f"copying file from {content.path} to {final_path}")
             shutil.copy(src=str(content.path), dst=str(final_path))
```
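On the uploader side (`FsspecUploader` and `LocalUploader` above), the same revert means the written result keeps the source extension in its name. A small illustration with hypothetical paths:

```python
from pathlib import Path

output_dir = Path("output")        # hypothetical upload destination
relative_path = "docs/report.pdf"  # hypothetical source-relative path
filename = "report.pdf"

new_path = output_dir / relative_path

# Before the revert (stem): "output/docs/report.json"
print(str(new_path).replace(filename, f"{Path(filename).stem}.json"))

# After the revert (full name): "output/docs/report.pdf.json"
print(str(new_path).replace(filename, f"{filename}.json"))
```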