mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00

Reviewers: I recommend reviewing commit-by-commit, or just looking at the final version of `partition/docx.py` using View File. This refactor solves a few problems, but mostly lays the groundwork to allow us to refine further aspects such as page-break detection and list-item detection, and to move python-docx internals upstream to that library so our work doesn't depend on that domain knowledge.
20 lines
633 B
Python
20 lines
633 B
Python
import os
|
|
import sys
|
|
|
|
from unstructured.partition.auto import partition
|
|
|
|
if __name__ == "__main__":
    # Command line: <script> FILE_PATH STRATEGY [MODEL_NAME]
    # Everything after the program name is treated as positional input.
    cli_args = sys.argv[1:]

    # Both the file path and the strategy are mandatory; bail out with exit
    # status 1 when either one is missing.
    if len(cli_args) < 2:
        print(
            "Please provide the path to the file as the first argument and the strategy as the "
            "second argument.",
        )
        sys.exit(1)

    file_path, strategy, *extra_args = cli_args

    # An explicit third argument wins; otherwise fall back to the
    # PARTITION_MODEL_NAME environment variable (None when it is unset).
    if extra_args:
        model_name = extra_args[0]
    else:
        model_name = os.environ.get("PARTITION_MODEL_NAME")

    result = partition(file_path, strategy=strategy, model_name=model_name)

    # Smoke check: index into the return value so the run errors out when
    # nothing usable came back.
    # NOTE(review): position 1 requires at least two elements in the result;
    # confirm that index 0 was not what was intended.
    result[1]
|