mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	refactor: move processing logic to IngestDoc (#248)
Moves the logic to partition a raw document to the IngestDoc level to allow for easier overrides for subclasses of IngestDoc.
This commit is contained in:
		
							parent
							
								
									69acb083bd
								
							
						
					
					
						commit
						1b8bf318b8
					
				@ -1,6 +1,6 @@
 | 
				
			|||||||
## 0.4.12-dev1
 | 
					## 0.4.12-dev1
 | 
				
			||||||
 | 
					
 | 
				
			||||||
* Adds console_entrypoint for unstructured-ingest and more structure/docs related to ingest.
 | 
					* Adds console_entrypoint for unstructured-ingest, other structure/doc updates related to ingest.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## 0.4.11
 | 
					## 0.4.11
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
@ -55,6 +55,7 @@ The `main.py` flags of --re-download/--no-re-download , --download-dir, --preser
 | 
				
			|||||||
In checklist form, the above steps are summarized as:
 | 
					In checklist form, the above steps are summarized as:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- [ ] Create a new module under [unstructured/ingest/connector/](unstructured/ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).
 | 
					- [ ] Create a new module under [unstructured/ingest/connector/](unstructured/ingest/connector/) implementing the 3 abstract base classes, similar to [unstructured/ingest/connector/s3_connector.py](unstructured/ingest/connector/s3_connector.py).
 | 
				
			||||||
 | 
					  - [ ] The subclass of `BaseIngestDoc` overrides `process_file()` if it needs processing logic beyond what [auto.partition()](unstructured/partition/auto.py) provides.
 | 
				
			||||||
- [ ] Update [unstructured/ingest/main.py](unstructured/ingest/main.py) with support for the new connector.
 | 
					- [ ] Update [unstructured/ingest/main.py](unstructured/ingest/main.py) with support for the new connector.
 | 
				
			||||||
- [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script.
 | 
					- [ ] Create a folder under [examples/ingest](examples/ingest) that includes at least one well documented script.
 | 
				
			||||||
- [ ] Add a script test_unstructured_ingest/test-ingest-\<the-new-data-source\>.sh. Its JSON output files should have a total size of no more than 100K.
 | 
					- [ ] Add a script test_unstructured_ingest/test-ingest-\<the-new-data-source\>.sh. Its JSON output files should have a total size of no more than 100K.
 | 
				
			||||||
 | 
				
			|||||||
@ -107,12 +107,12 @@ class S3IngestDoc(BaseIngestDoc):
 | 
				
			|||||||
            print(f"fetching {self} - PID: {os.getpid()}")
 | 
					            print(f"fetching {self} - PID: {os.getpid()}")
 | 
				
			||||||
        s3_cli.download_file(self.config.s3_bucket, self.s3_key, self._tmp_download_file())
 | 
					        s3_cli.download_file(self.config.s3_bucket, self.s3_key, self._tmp_download_file())
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def write_result(self, result):
 | 
					    def write_result(self):
 | 
				
			||||||
        """Write the structured json result for this doc. result must be json serializable."""
 | 
					        """Write the structured json result for this doc. result must be json serializable."""
 | 
				
			||||||
        output_filename = self._output_filename()
 | 
					        output_filename = self._output_filename()
 | 
				
			||||||
        output_filename.parent.mkdir(parents=True, exist_ok=True)
 | 
					        output_filename.parent.mkdir(parents=True, exist_ok=True)
 | 
				
			||||||
        with open(output_filename, "w") as output_f:
 | 
					        with open(output_filename, "w") as output_f:
 | 
				
			||||||
            output_f.write(json.dumps(result, ensure_ascii=False, indent=2))
 | 
					            output_f.write(json.dumps(self.isd_elems_no_filename, ensure_ascii=False, indent=2))
 | 
				
			||||||
        print(f"Wrote {output_filename}")
 | 
					        print(f"Wrote {output_filename}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @property
 | 
					    @property
 | 
				
			||||||
 | 
				
			|||||||
@ -2,9 +2,6 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from unstructured.partition.auto import partition
 | 
					 | 
				
			||||||
from unstructured.staging.base import convert_to_isd
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
from unstructured_inference.models.detectron2 import MODEL_TYPES
 | 
					from unstructured_inference.models.detectron2 import MODEL_TYPES
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -25,24 +22,12 @@ def process_document(doc):
 | 
				
			|||||||
        # in the future, get_file_handle() could also be supported
 | 
					        # in the future, get_file_handle() could also be supported
 | 
				
			||||||
        doc.get_file()
 | 
					        doc.get_file()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # accessing the .filename property could lazily call .get_file(), but
 | 
					        isd_elems_no_filename = doc.process_file()
 | 
				
			||||||
        # keeping them as two distinct calls for end-user transparency for now
 | 
					 | 
				
			||||||
        print(f"Processing {doc.filename}")
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        elements = partition(filename=doc.filename)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        isd_elems = convert_to_isd(elements)
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
        isd_elems_no_filename = []
 | 
					 | 
				
			||||||
        for elem in isd_elems:
 | 
					 | 
				
			||||||
            # type: ignore
 | 
					 | 
				
			||||||
            elem["metadata"].pop("filename")  # type: ignore[attr-defined]
 | 
					 | 
				
			||||||
            isd_elems_no_filename.append(elem)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        # Note, this may be a no-op if the IngestDoc doesn't do anything to persist
 | 
					        # Note, this may be a no-op if the IngestDoc doesn't do anything to persist
 | 
				
			||||||
        # the results. Instead, the MainProcess (caller) may work with the aggregate
 | 
					        # the results. Instead, the MainProcess (caller) may work with the aggregate
 | 
				
			||||||
        # results across all docs in memory.
 | 
					        # results across all docs in memory.
 | 
				
			||||||
        doc.write_result(isd_elems_no_filename)
 | 
					        doc.write_result()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    except Exception:
 | 
					    except Exception:
 | 
				
			||||||
        # TODO(crag) save the exception instead of print?
 | 
					        # TODO(crag) save the exception instead of print?
 | 
				
			||||||
 | 
				
			|||||||
@ -3,6 +3,9 @@ through Unstructured."""
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
from abc import ABC, abstractmethod
 | 
					from abc import ABC, abstractmethod
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					from unstructured.partition.auto import partition
 | 
				
			||||||
 | 
					from unstructured.staging.base import convert_to_isd
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
class BaseConnector(ABC):
 | 
					class BaseConnector(ABC):
 | 
				
			||||||
    """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive."""
 | 
					    """Abstract Base Class for a connector to a remote source, e.g. S3 or Google Drive."""
 | 
				
			||||||
@ -80,6 +83,20 @@ class BaseIngestDoc(ABC):
 | 
				
			|||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    @abstractmethod
 | 
					    @abstractmethod
 | 
				
			||||||
    def write_result(self, result):
 | 
					    def write_result(self):
 | 
				
			||||||
        """Write the structured json result for this doc. result must be json serializable."""
 | 
					        """Write the structured json result for this doc. result must be json serializable."""
 | 
				
			||||||
        pass
 | 
					        pass
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def process_file(self):
 | 
				
			||||||
 | 
					        print(f"Processing {self.filename}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        elements = partition(filename=self.filename)
 | 
				
			||||||
 | 
					        isd_elems = convert_to_isd(elements)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self.isd_elems_no_filename = []
 | 
				
			||||||
 | 
					        for elem in isd_elems:
 | 
				
			||||||
 | 
					            # type: ignore
 | 
				
			||||||
 | 
					            elem["metadata"].pop("filename")  # type: ignore[attr-defined]
 | 
				
			||||||
 | 
					            self.isd_elems_no_filename.append(elem)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        return self.isd_elems_no_filename
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user