2023-04-12 14:31:01 -04:00
|
|
|
import io
|
2023-04-13 11:04:15 -04:00
|
|
|
from typing import IO, Callable, Dict, Optional, Tuple
|
2023-04-12 14:31:01 -04:00
|
|
|
|
|
|
|
import requests
|
2023-01-09 16:15:14 -05:00
|
|
|
|
2023-06-07 21:22:18 -07:00
|
|
|
from unstructured.documents.elements import DataSourceMetadata
|
2023-05-15 13:23:19 -05:00
|
|
|
from unstructured.file_utils.filetype import (
|
|
|
|
FILETYPE_TO_MIMETYPE,
|
|
|
|
STR_TO_FILETYPE,
|
|
|
|
FileType,
|
|
|
|
detect_filetype,
|
|
|
|
)
|
2023-04-13 11:04:15 -04:00
|
|
|
from unstructured.logger import logger
|
2023-04-12 14:31:01 -04:00
|
|
|
from unstructured.partition.common import exactly_one
|
2023-05-19 15:57:42 -04:00
|
|
|
from unstructured.partition.csv import partition_csv
|
2023-02-17 09:30:23 -05:00
|
|
|
from unstructured.partition.doc import partition_doc
|
2023-01-09 16:15:14 -05:00
|
|
|
from unstructured.partition.docx import partition_docx
|
|
|
|
from unstructured.partition.email import partition_email
|
2023-03-14 11:52:21 -04:00
|
|
|
from unstructured.partition.epub import partition_epub
|
2023-01-09 16:15:14 -05:00
|
|
|
from unstructured.partition.html import partition_html
|
2023-02-27 17:30:54 +01:00
|
|
|
from unstructured.partition.image import partition_image
|
2023-03-09 03:36:01 +09:00
|
|
|
from unstructured.partition.json import partition_json
|
2023-02-27 23:36:44 +01:00
|
|
|
from unstructured.partition.md import partition_md
|
2023-03-28 16:15:22 -04:00
|
|
|
from unstructured.partition.msg import partition_msg
|
2023-05-04 15:28:08 -04:00
|
|
|
from unstructured.partition.odt import partition_odt
|
2023-06-23 20:45:31 +02:00
|
|
|
from unstructured.partition.org import partition_org
|
2023-01-09 16:15:14 -05:00
|
|
|
from unstructured.partition.pdf import partition_pdf
|
2023-02-17 11:57:08 -05:00
|
|
|
from unstructured.partition.ppt import partition_ppt
|
2023-01-23 12:03:09 -05:00
|
|
|
from unstructured.partition.pptx import partition_pptx
|
2023-06-12 15:31:10 -04:00
|
|
|
from unstructured.partition.rst import partition_rst
|
2023-04-10 17:25:03 -04:00
|
|
|
from unstructured.partition.rtf import partition_rtf
|
2023-01-13 16:39:53 -05:00
|
|
|
from unstructured.partition.text import partition_text
|
2023-06-15 13:50:53 -05:00
|
|
|
from unstructured.partition.tsv import partition_tsv
|
2023-05-16 15:40:40 -04:00
|
|
|
from unstructured.partition.xlsx import partition_xlsx
|
2023-05-18 11:40:12 -04:00
|
|
|
from unstructured.partition.xml import partition_xml
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
2023-02-08 10:11:15 -05:00
|
|
|
def partition(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO] = None,
    file_filename: Optional[str] = None,
    url: Optional[str] = None,
    include_page_breaks: bool = False,
    strategy: str = "auto",
    encoding: Optional[str] = None,
    paragraph_grouper: Optional[Callable[[str], str]] = None,
    headers: Optional[Dict[str, str]] = None,
    ssl_verify: bool = True,
    ocr_languages: str = "eng",
    pdf_infer_table_structure: bool = False,
    xml_keep_tags: bool = False,
    data_source_metadata: Optional[DataSourceMetadata] = None,
    **kwargs,
):
    """Partitions a document into its constituent elements. Will use libmagic to determine
    the file's type and route it to the appropriate partitioning function. Applies the default
    parameters for each partitioning function. Use the document-type specific partitioning
    functions if you need access to additional kwarg options.

    The document source is given through filename, file or url; the three are checked
    together with exactly_one before anything else happens.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    content_type
        A string defining the file content in MIME type
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    file_filename
        When file is not None, the filename (string) to store in element metadata. E.g. "foo.txt"
    url
        The url for a remote document. Pass in content_type if you want partition to treat
        the document as a specific content_type.
    include_page_breaks
        If True, the output will include page breaks if the filetype supports it
    strategy
        The strategy to use for partitioning PDF/image. Uses a layout detection model if set
        to 'hi_res', otherwise partition simply extracts the text from the document
        and processes it.
    encoding
        The encoding method used to decode the text input. If None, utf-8 will be used.
    paragraph_grouper
        A str -> str callable. It is only forwarded to partition_text (plain-text documents);
        see that function for how it is applied.
    headers
        The headers to be used in conjunction with the HTTP request if URL is set.
        Defaults to no extra headers. A warning is logged if headers are given without a url.
    ssl_verify
        If the URL parameter is set, determines whether or not partition uses SSL verification
        in the HTTP request.
    ocr_languages
        The languages to use for the Tesseract agent. To use a language, you'll first need
        to install the appropriate Tesseract language pack.
    pdf_infer_table_structure
        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
        additional metadata field, "text_as_html," where the value (string) is just a
        transformation of the data into an HTML <table>.
        The "text" field for a partitioned Table Element is always present, whether True or False.
    xml_keep_tags
        If True, will retain the XML tags in the output. Otherwise it will simply extract
        the text from within the tags. Only applies to partition_xml.
    data_source_metadata
        Stored as-is on metadata.data_source of every returned element.
    **kwargs
        Forwarded unchanged to the document-type specific partitioning function.

    Returns
    -------
    The list of elements produced by the selected partitioning function (an empty list for
    FileType.EMPTY). Each element's metadata has url, data_source and filetype set here.

    Raises
    ------
    ValueError
        If the detected file type is not handled by any of the partitioning functions below.
    """
    exactly_one(file=file, filename=filename, url=url)

    if url is not None:
        file, filetype = file_and_type_from_url(
            url=url,
            content_type=content_type,
            # Hand over the same empty mapping the previous `{}` default provided.
            headers=headers if headers is not None else {},
            ssl_verify=ssl_verify,
        )
    else:
        if headers:
            logger.warning(
                "The headers kwarg is set but the url kwarg is not. "
                "The headers kwarg will be ignored.",
            )
        filetype = detect_filetype(
            filename=filename,
            file=file,
            file_filename=file_filename,
            content_type=content_type,
            encoding=encoding,
        )

    # Rewind so the partitioner starts at the beginning of the stream; type detection
    # (or the download above) may have left the position elsewhere.
    if file is not None:
        file.seek(0)

    if filetype == FileType.DOC:
        elements = partition_doc(filename=filename, file=file, **kwargs)
    elif filetype == FileType.DOCX:
        elements = partition_docx(filename=filename, file=file, **kwargs)
    elif filetype == FileType.ODT:
        elements = partition_odt(filename=filename, file=file, **kwargs)
    elif filetype == FileType.EML:
        elements = partition_email(filename=filename, file=file, encoding=encoding, **kwargs)
    elif filetype == FileType.MSG:
        elements = partition_msg(filename=filename, file=file, **kwargs)
    elif filetype == FileType.HTML:
        elements = partition_html(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            encoding=encoding,
            **kwargs,
        )
    elif filetype == FileType.XML:
        elements = partition_xml(
            filename=filename,
            file=file,
            encoding=encoding,
            xml_keep_tags=xml_keep_tags,
            **kwargs,
        )
    elif filetype == FileType.EPUB:
        elements = partition_epub(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.ORG:
        elements = partition_org(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.RST:
        elements = partition_rst(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.MD:
        elements = partition_md(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.PDF:
        elements = partition_pdf(
            filename=filename,  # type: ignore
            file=file,  # type: ignore
            url=None,
            include_page_breaks=include_page_breaks,
            infer_table_structure=pdf_infer_table_structure,
            strategy=strategy,
            ocr_languages=ocr_languages,
            **kwargs,
        )
    elif filetype in (FileType.PNG, FileType.JPG):
        elements = partition_image(
            filename=filename,  # type: ignore
            file=file,  # type: ignore
            url=None,
            include_page_breaks=include_page_breaks,
            strategy=strategy,
            ocr_languages=ocr_languages,
            **kwargs,
        )
    elif filetype == FileType.TXT:
        elements = partition_text(
            filename=filename,
            file=file,
            encoding=encoding,
            paragraph_grouper=paragraph_grouper,
            **kwargs,
        )
    elif filetype == FileType.RTF:
        elements = partition_rtf(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.PPT:
        elements = partition_ppt(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.PPTX:
        elements = partition_pptx(
            filename=filename,
            file=file,
            include_page_breaks=include_page_breaks,
            **kwargs,
        )
    elif filetype == FileType.JSON:
        elements = partition_json(filename=filename, file=file, **kwargs)
    elif filetype in (FileType.XLSX, FileType.XLS):
        elements = partition_xlsx(filename=filename, file=file, **kwargs)
    elif filetype == FileType.CSV:
        elements = partition_csv(filename=filename, file=file, **kwargs)
    elif filetype == FileType.TSV:
        elements = partition_tsv(filename=filename, file=file, **kwargs)
    elif filetype == FileType.EMPTY:
        elements = []
    else:
        msg = "Invalid file" if not filename else f"Invalid file {filename}"
        raise ValueError(f"{msg}. The {filetype} file type is not supported in partition.")

    # Nothing to annotate; returning here also keeps the MIME lookup below from running
    # for empty results, exactly as when it lived inside the loop.
    if not elements:
        return elements

    # The reported MIME type is the same for every element, so resolve it once.
    # A caller-supplied content_type wins over the detected type; an unrecognized
    # content_type is reported as None.
    if content_type is not None:
        out_filetype = STR_TO_FILETYPE.get(content_type)
        mimetype = FILETYPE_TO_MIMETYPE[out_filetype] if out_filetype is not None else None
    else:
        mimetype = FILETYPE_TO_MIMETYPE[filetype]

    for element in elements:
        element.metadata.url = url
        element.metadata.data_source = data_source_metadata
        element.metadata.filetype = mimetype

    return elements
|
|
|
|
|
|
|
|
|
|
|
|
def file_and_type_from_url(
    url: str,
    content_type: Optional[str] = None,
    headers: Optional[Dict[str, str]] = None,
    ssl_verify: bool = True,
) -> Tuple[io.BytesIO, Optional[FileType]]:
    """Downloads the document at url and detects its file type.

    Parameters
    ----------
    url
        The url of the remote document.
    content_type
        Optional MIME type to use for detection. When not given, the media type from the
        response's Content-Type header (without parameters such as charset) is used.
    headers
        Extra headers for the HTTP GET request. Defaults to no extra headers.
    ssl_verify
        Passed to requests as ``verify``; controls SSL certificate verification.

    Returns
    -------
    A tuple of (in-memory binary buffer holding the response body, detected file type).
    """
    import codecs  # local import: only needed to validate the server-declared charset

    # NOTE(review): no timeout is passed, so requests may wait on the server indefinitely,
    # and the HTTP status is not checked, so an error page body is handed to detection too.
    response = requests.get(url, headers=headers or {}, verify=ssl_verify)
    file = io.BytesIO(response.content)

    header_mimetype, header_charset = _parse_content_type_header(
        response.headers.get("Content-Type"),
    )
    content_type = content_type or header_mimetype

    # The character set of a response is declared by the charset parameter of Content-Type.
    # Content-Encoding names a compression coding such as "gzip" and is not a text encoding,
    # so it must not be used here. Fall back to utf-8 when the server declares nothing or
    # declares a charset Python has no codec for.
    encoding = "utf-8"
    if header_charset:
        try:
            codecs.lookup(header_charset)
        except LookupError:
            logger.warning(
                "Ignoring unknown charset %r from the Content-Type header; using utf-8.",
                header_charset,
            )
        else:
            encoding = header_charset

    filetype = detect_filetype(file=file, content_type=content_type, encoding=encoding)
    return file, filetype


def _parse_content_type_header(value: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Splits a Content-Type header value into (lower-cased media type, charset or None)."""
    if not value:
        return None, None

    media_type, _, raw_params = value.partition(";")
    charset = None
    for raw_param in raw_params.split(";"):
        key, _, param_value = raw_param.partition("=")
        if key.strip().lower() == "charset":
            # Parameter values may be quoted, e.g. charset="utf-8".
            charset = param_value.strip().strip("\"'") or None
            break

    return media_type.strip().lower() or None, charset
|