fix: revert dropping of filename extension for some connectors (#3109)

The V2 refactor of the ingest code removes the original file extension
from output filenames. Because the upgrade of connectors is incomplete,
upgraded connectors drop the original file extension while the others
keep it. Whether we want this behavior at all is still TBD.

This PR reverts specifically that change in the V2 ingest code so that
the original file extension is preserved downstream.

## Testing
CI is passing with filenames updated via the `Ingest Test Fixtures Update`
workflow.

---------

Co-authored-by: ryannikolaidis <ryannikolaidis@users.noreply.github.com>
This commit is contained in:
ryannikolaidis 2024-05-29 12:14:22 -07:00 committed by GitHub
parent f4457249a7
commit 6b5d8a9785
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
30 changed files with 16 additions and 8 deletions

View File

@ -1,3 +1,13 @@
## 0.14.4-dev
### Enhancements
### Features
### Fixes
* **Ingest preserves original file extension** Ingest V2 introduced a change that dropped the original extension for upgraded connectors. This reverts that change.
## 0.14.3
### Enhancements

View File

@ -1 +1 @@
__version__ = "0.14.3" # pragma: no cover
__version__ = "0.14.4-dev" # pragma: no cover

View File

@ -43,7 +43,7 @@ class ChunkStep(PipelineStep):
return False
def get_output_filepath(self, filename: Path) -> Path:
hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
filepath = (self.cache_dir / hashed_output_file).resolve()
filepath.parent.mkdir(parents=True, exist_ok=True)
return filepath

View File

@ -43,7 +43,7 @@ class EmbedStep(PipelineStep):
return False
def get_output_filepath(self, filename: Path) -> Path:
hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
filepath = (self.cache_dir / hashed_output_file).resolve()
filepath.parent.mkdir(parents=True, exist_ok=True)
return filepath

View File

@ -38,7 +38,7 @@ class PartitionStep(PipelineStep):
return False
def get_output_filepath(self, filename: Path) -> Path:
hashed_output_file = f"{self.get_hash(extras=[filename.stem])}.json"
hashed_output_file = f"{self.get_hash(extras=[filename.name])}.json"
filepath = (self.cache_dir / hashed_output_file).resolve()
filepath.parent.mkdir(parents=True, exist_ok=True)
return filepath

View File

@ -308,7 +308,7 @@ class FsspecUploader(Uploader):
Path(self.upload_config.path_without_protocol)
/ file_data.source_identifiers.relative_path
)
updated_upload_path = upload_path.parent / f"{upload_path.stem}.json"
updated_upload_path = upload_path.parent / f"{upload_path.name}.json"
upload_path_str = str(updated_upload_path)
path_str = str(path.resolve())
if self.fs.exists(path=upload_path_str) and not self.upload_config.overwrite:

View File

@ -149,9 +149,7 @@ class LocalUploader(Uploader):
for content in contents:
identifiers = content.file_data.source_identifiers
new_path = self.upload_config.output_path / identifiers.relative_path
final_path = str(new_path).replace(
identifiers.filename, f"{identifiers.filename_stem}.json"
)
final_path = str(new_path).replace(identifiers.filename, f"{identifiers.filename}.json")
Path(final_path).parent.mkdir(parents=True, exist_ok=True)
logger.debug(f"copying file from {content.path} to {final_path}")
shutil.copy(src=str(content.path), dst=str(final_path))