diff --git a/CHANGELOG.md b/CHANGELOG.md index 65a462ca8..98311ce13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ -## 0.10.17-dev14 +## 0.10.17-dev15 ### Enhancements -* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. +* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, Slack, and DeltaTable connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc. * **Add functionality to save embedded images in PDF's separately as images** This allows users to save embedded images in PDF's separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits." * **Azure Cognite Search destination connector** New Azure Cognitive Search destination connector added to ingest CLI. Users may now use `unstructured-ingest` to write partitioned data from over 20 data sources (so far) to an Azure Cognitive Search index. * **Improves salesforce partitioning** Partitions Salesforce data as xlm instead of text for improved detail and flexibility. Partitions htmlbody instead of textbody for Salesforce emails. Importance: Allows all Salesforce fields to be ingested and gives Salesforce emails more detailed partitioning. diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json index 1b2c921b8..3734bfb66 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-0.parquet", + "version": 264934223616864047145159629306912568989, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.604000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json index 0f3e9b381..202391c12 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-1.parquet", + "version": 139732878514171884135017505553329458078, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.629000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json index fc8e77b1e..e4004e051 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-2.parquet", + "version": 94569544647555135566266174719335103474, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.634000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json index c30c96498..644a82922 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-3.parquet", + "version": 153924277850028657610430472976884166368, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.609000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json index a89ae23fb..3c3059e5c 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-4.parquet", + "version": 106461216032689499284003671440554259965, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.599000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json index 58aaf2090..37d698815 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-5.parquet", + "version": 164150003651878262139646734756859067992, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.614000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json index 313dd687c..34bb7d179 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-6.parquet", + "version": 117019847084687446803154205344125897829, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.619000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json index f63c3badd..9046006c9 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-7.parquet", + "version": 93578343538662480706683120160579695806, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.624000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json index 05fd088f3..59f11ba85 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-8.parquet", + "version": 329407810704817028643559273505069222621, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.644000" }, diff --git a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json index 3604829e1..82acb28ae 100644 --- a/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json +++ b/test_unstructured_ingest/expected-structured-output/delta-table/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.json @@ -5,6 +5,7 @@ "metadata": { "data_source": { "url": "s3://utic-dev-tech-fixtures/sample-delta-lake-data/deltatable/0-9d594ee0-ad36-4e7e-a6be-f53975fe3d10-9.parquet", + "version": 127086160869624650753647884727730407942, "date_created": "2023-08-16 13:30:38.644000", "date_modified": "2023-08-16 13:30:38.639000" }, diff --git a/unstructured/__version__.py b/unstructured/__version__.py index f87f0c276..01785404e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.10.17-dev14" # pragma: no cover +__version__ = "0.10.17-dev15" # pragma: no cover diff --git a/unstructured/ingest/connector/delta_table.py b/unstructured/ingest/connector/delta_table.py index 92594cad2..f471b2b06 100644 --- a/unstructured/ingest/connector/delta_table.py +++ b/unstructured/ingest/connector/delta_table.py @@ -15,6 +15,7 @@ from unstructured.ingest.interfaces import ( BaseSourceConnector, IngestDocCleanupMixin, SourceConnectorCleanupMixin, + SourceMetadata, WriteConfig, ) from unstructured.ingest.logger import logger @@ -50,26 +51,10 @@ class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): basename = os.path.basename(self.uri) return os.path.splitext(basename)[0] - @property - def source_url(self) -> t.Optional[str]: - """The url of the source document.""" - return self.uri - - @property - def date_created(self) -> t.Optional[str]: - """This is the creation time of the table itself, not the file or specific record""" - # TODO get creation time of file/record - return self.created_at - @property def filename(self): return (Path(self.read_config.download_dir) / f"{self.uri_filename()}.csv").resolve() - @property - def date_modified(self) -> t.Optional[str]: - """The date the document was last modified on the source system.""" - return self.modified_date - @property def _output_filename(self): """Create filename document id combined with a hash of the query to uniquely identify @@ -80,11 +65,8 @@ class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): self.filename.parent.mkdir(parents=True, exist_ok=True) self._output_filename.parent.mkdir(parents=True, exist_ok=True) - @SourceConnectionError.wrap - @BaseIngestDoc.skip_if_file_exists @requires_dependencies(["fsspec"], extras="delta-table") - def get_file(self): - import pyarrow.parquet as pq + def _get_fs_from_uri(self): from fsspec.core import url_to_fs try: @@ -94,6 +76,29 @@ class DeltaTableIngestDoc(IngestDocCleanupMixin, BaseIngestDoc): f"uri {self.uri} may be associated with a filesystem that " f"requires additional dependencies: {error}", ) + return fs + + def update_source_metadata(self, **kwargs): + fs = kwargs.get("fs", self._get_fs_from_uri()) + version = ( + fs.checksum(self.uri) if fs.protocol != "gs" else fs.info(self.uri).get("etag", "") + ) + file_exists = fs.exists(self.uri) + self.source_metadata = SourceMetadata( + date_created=self.created_at, + date_modified=self.modified_date, + version=version, + source_url=self.uri, + exists=file_exists, + ) + + @SourceConnectionError.wrap + @BaseIngestDoc.skip_if_file_exists + def get_file(self): + import pyarrow.parquet as pq + + fs = self._get_fs_from_uri() + self.update_source_metadata(fs=fs) logger.info(f"using a {fs} filesystem to collect table data") self._create_full_tmp_dir_path() logger.debug(f"Fetching {self} - PID: {os.getpid()}")