2023-01-09 16:15:14 -05:00
|
|
|
from typing import IO, Optional
|
|
|
|
|
|
|
|
from unstructured.file_utils.filetype import detect_filetype, FileType
|
|
|
|
from unstructured.partition.docx import partition_docx
|
|
|
|
from unstructured.partition.email import partition_email
|
|
|
|
from unstructured.partition.html import partition_html
|
|
|
|
from unstructured.partition.pdf import partition_pdf
|
2023-01-23 12:03:09 -05:00
|
|
|
from unstructured.partition.pptx import partition_pptx
|
2023-01-13 22:24:13 -06:00
|
|
|
from unstructured.partition.image import partition_image
|
2023-01-13 16:39:53 -05:00
|
|
|
from unstructured.partition.text import partition_text
|
2023-01-09 16:15:14 -05:00
|
|
|
|
|
|
|
|
|
|
|
def partition(filename: Optional[str] = None, file: Optional[IO] = None):
    """Split a document into its elements using the partitioner matching its file type.

    The file type is detected first (using libmagic) and the document is then handed to the
    corresponding partitioning function with that function's default parameters. Call the
    document-type specific partitioning functions directly if you need access to their
    additional kwarg options.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Raises
    ------
    ValueError
        If the detected file type is not one of the types routed below.
    """
    detected = detect_filetype(filename=filename, file=file)

    # Rewind the stream before handing it to a partitioner
    # (presumably detection reads from it -- confirm against detect_filetype).
    if file is not None:
        file.seek(0)

    # Route to the partitioner for the detected type; each branch returns immediately.
    if detected == FileType.DOCX:
        return partition_docx(filename=filename, file=file)
    if detected == FileType.EML:
        return partition_email(filename=filename, file=file)
    if detected == FileType.HTML:
        return partition_html(filename=filename, file=file)
    if detected == FileType.PDF:
        return partition_pdf(filename=filename, file=file, url=None)  # type: ignore
    if detected in (FileType.PNG, FileType.JPG):
        return partition_image(filename=filename, file=file, url=None)  # type: ignore
    if detected == FileType.TXT:
        return partition_text(filename=filename, file=file)
    if detected == FileType.PPTX:
        return partition_pptx(filename=filename, file=file)

    # No partitioner matched: include the filename in the error only when one was given.
    if filename:
        prefix = f"Invalid file {filename}"
    else:
        prefix = "Invalid file"
    raise ValueError(f"{prefix}. File type not support in partition.")
|