Feat/1332 save embedded images in pdf (#1371)

Addresses
[#1332](https://github.com/Unstructured-IO/unstructured/issues/1332)
with `unstructured-inference` PR
[#208](https://github.com/Unstructured-IO/unstructured-inference/pull/208).
### Summary
- Add `image_path` to element metadata
- Pass parameters related to extracting images in PDF
- Preserve image elements that would otherwise be dropped for containing
garbage text when `el.metadata.image_path` is set
### Testing

```python
from unstructured.partition.pdf import partition_pdf

f_path = "example-docs/embedded-images.pdf"

# default image output directory
elements = partition_pdf(
    f_path,
    strategy=strategy,
    extract_images_in_pdf=True,
)

# specific image output directory
elements = partition_pdf(
    f_path,
    strategy=strategy,
    extract_images_in_pdf=True,
    image_output_dir_path=<directory path>,
)
```
This commit is contained in:
Christine Straub 2023-09-22 02:16:03 -07:00 committed by GitHub
parent 92ad7698fb
commit 2d951722df
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
35 changed files with 1703 additions and 2328 deletions

View File

@ -3,6 +3,7 @@
### Enhancements
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
* **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
### Features

View File

@ -105,5 +105,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
zipp==3.17.0
# via importlib-metadata

Binary file not shown.

View File

@ -50,7 +50,7 @@ tabulate==0.9.0
# via -r requirements/base.in
tqdm==4.66.1
# via nltk
typing-extensions==4.7.1
typing-extensions==4.8.0
# via typing-inspect
typing-inspect==0.9.0
# via dataclasses-json

View File

@ -105,5 +105,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
zipp==3.17.0
# via importlib-metadata

View File

@ -23,7 +23,7 @@ IPython<8.13
# AttributeError: 'ResourcePath' object has no attribute 'collection'
Office365-REST-Python-Client<2.4.3
# NOTE(christine) Pinned to set the `unstructured-inference` version
unstructured-inference==0.5.28
unstructured-inference==0.5.31
# NOTE(klaijan) - Moved pin from test.in
# pinning to avoid error in argilla library
pydantic<2
@ -34,3 +34,5 @@ unstructured.pytesseract>=0.3.12
weaviate-client==3.23.2
# Note(yuming) - pinning to avoid conflict with paddle install
matplotlib==3.7.2
# NOTE(crag) - pin to available pandas for python 3.8 (at least in CI)
pandas<2.1.1

View File

@ -69,9 +69,7 @@ defusedxml==0.7.1
distlib==0.3.7
# via virtualenv
exceptiongroup==1.1.3
# via
# -c requirements/test.txt
# anyio
# via anyio
executing==1.2.0
# via stack-data
fastjsonschema==2.18.0
@ -97,7 +95,7 @@ importlib-metadata==6.8.0
# jupyterlab
# jupyterlab-server
# nbconvert
importlib-resources==6.0.1
importlib-resources==6.1.0
# via
# jsonschema
# jsonschema-specifications
@ -133,7 +131,7 @@ json5==0.9.14
# via jupyterlab-server
jsonpointer==2.4
# via jsonschema
jsonschema[format-nongpl]==4.19.0
jsonschema[format-nongpl]==4.19.1
# via
# jupyter-events
# jupyterlab-server
@ -211,7 +209,7 @@ nest-asyncio==1.5.8
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.0.3
notebook==7.0.4
# via jupyter
notebook-shim==0.2.3
# via
@ -346,7 +344,6 @@ tinycss2==1.2.1
# via nbconvert
tomli==2.0.1
# via
# -c requirements/test.txt
# build
# jupyterlab
# pip-tools
@ -376,7 +373,7 @@ traitlets==5.10.0
# nbconvert
# nbformat
# qtconsole
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# -c requirements/test.txt
@ -408,7 +405,7 @@ wheel==0.41.2
# pip-tools
widgetsnbextension==4.0.9
# via ipywidgets
zipp==3.16.2
zipp==3.17.0
# via
# importlib-metadata
# importlib-resources

View File

@ -9,7 +9,9 @@ numpy==1.24.4
# -c requirements/constraints.in
# pandas
pandas==2.0.3
# via -r requirements/extra-csv.in
# via
# -c requirements/constraints.in
# -r requirements/extra-csv.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3.post1

View File

@ -8,5 +8,5 @@ importlib-metadata==6.8.0
# via markdown
markdown==3.4.4
# via -r requirements/extra-markdown.in
zipp==3.16.2
zipp==3.17.0
# via importlib-metadata

View File

@ -61,7 +61,7 @@ imgaug==0.4.0
# via unstructured-paddleocr
importlib-metadata==6.8.0
# via flask
importlib-resources==6.0.1
importlib-resources==6.1.0
# via matplotlib
itsdangerous==2.1.2
# via flask
@ -125,7 +125,9 @@ packaging==23.1
# scikit-image
# visualdl
pandas==2.0.3
# via visualdl
# via
# -c requirements/constraints.in
# visualdl
pdf2image==1.16.3
# via unstructured-paddleocr
pillow==10.0.1
@ -167,7 +169,7 @@ pywavelets==1.4.1
# via scikit-image
rapidfuzz==3.3.0
# via unstructured-paddleocr
rarfile==4.0
rarfile==4.1
# via visualdl
requests==2.31.0
# via
@ -213,7 +215,7 @@ visualdl==2.5.3
# via unstructured-paddleocr
werkzeug==2.3.7
# via flask
zipp==3.16.2
zipp==3.17.0
# via
# importlib-metadata
# importlib-resources

View File

@ -22,7 +22,7 @@ coloredlogs==15.0.1
# via onnxruntime
contourpy==1.1.1
# via matplotlib
cryptography==41.0.3
cryptography==41.0.4
# via pdfminer-six
cycler==0.11.0
# via matplotlib
@ -39,7 +39,7 @@ fonttools==4.42.1
# via matplotlib
fsspec==2023.9.1
# via huggingface-hub
huggingface-hub==0.17.1
huggingface-hub==0.17.2
# via
# timm
# transformers
@ -50,7 +50,7 @@ idna==3.4
# via
# -c requirements/base.txt
# requests
importlib-resources==6.0.1
importlib-resources==6.1.0
# via matplotlib
iopath==0.1.10
# via layoutparser
@ -88,7 +88,7 @@ omegaconf==2.3.0
# via effdet
onnx==1.14.1
# via unstructured-inference
onnxruntime==1.15.1
onnxruntime==1.16.0
# via unstructured-inference
opencv-python==4.8.0.76
# via
@ -104,7 +104,9 @@ packaging==23.1
# transformers
# unstructured-pytesseract
pandas==2.0.3
# via layoutparser
# via
# -c requirements/constraints.in
# layoutparser
pdf2image==1.16.3
# via
# -r requirements/extra-pdf-image.in
@ -158,6 +160,8 @@ pyyaml==6.0.1
# omegaconf
# timm
# transformers
rapidfuzz==3.3.0
# via unstructured-inference
regex==2023.8.8
# via
# -c requirements/base.txt
@ -206,7 +210,7 @@ tqdm==4.66.1
# transformers
transformers==4.33.2
# via unstructured-inference
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# huggingface-hub
@ -215,7 +219,7 @@ typing-extensions==4.7.1
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.5.28
unstructured-inference==0.5.31
# via
# -c requirements/constraints.in
# -r requirements/extra-pdf-image.in
@ -228,5 +232,5 @@ urllib3==1.26.16
# -c requirements/base.txt
# -c requirements/constraints.in
# requests
zipp==3.16.2
zipp==3.17.0
# via importlib-resources

View File

@ -10,5 +10,5 @@ pillow==10.0.1
# via python-pptx
python-pptx==0.6.21
# via -r requirements/extra-pptx.in
xlsxwriter==3.1.3
xlsxwriter==3.1.4
# via python-pptx

View File

@ -13,7 +13,9 @@ numpy==1.24.4
openpyxl==3.1.2
# via -r requirements/extra-xlsx.in
pandas==2.0.3
# via -r requirements/extra-xlsx.in
# via
# -c requirements/constraints.in
# -r requirements/extra-xlsx.in
python-dateutil==2.8.2
# via pandas
pytz==2023.3.post1

View File

@ -24,7 +24,7 @@ filelock==3.12.4
# transformers
fsspec==2023.9.1
# via huggingface-hub
huggingface-hub==0.17.1
huggingface-hub==0.17.2
# via transformers
idna==3.4
# via
@ -93,7 +93,7 @@ tqdm==4.66.1
# transformers
transformers==4.33.2
# via -r requirements/huggingface.in
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# huggingface-hub

View File

@ -29,7 +29,7 @@ requests==2.31.0
# via
# -c requirements/base.txt
# pyairtable
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# pyairtable

View File

@ -39,7 +39,7 @@ charset-normalizer==3.2.0
# -c requirements/base.txt
# aiohttp
# requests
cryptography==41.0.3
cryptography==41.0.4
# via
# azure-identity
# azure-storage-blob
@ -87,7 +87,7 @@ six==1.16.0
# via
# azure-core
# isodate
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# azure-core

View File

@ -21,7 +21,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via boxsdk
fsspec==2023.9.1
# via

View File

@ -57,7 +57,7 @@ google-auth-oauthlib==1.1.0
# via gcsfs
google-cloud-core==2.3.3
# via google-cloud-storage
google-cloud-storage==2.10.0
google-cloud-storage==2.11.0
# via gcsfs
google-crc32c==1.5.0
# via google-resumable-media

View File

@ -17,7 +17,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via pyjwt
deprecated==1.2.14
# via pygithub

View File

@ -17,7 +17,7 @@ charset-normalizer==3.2.0
# requests
google-api-core==2.11.1
# via google-api-python-client
google-api-python-client==2.99.0
google-api-python-client==2.100.0
# via -r requirements/ingest-google-drive.in
google-auth==2.23.0
# via

View File

@ -21,7 +21,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via
# msal
# pyjwt

View File

@ -15,7 +15,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via
# msal
# pyjwt

View File

@ -48,7 +48,7 @@ s3fs==2023.9.1
# via -r requirements/ingest-s3.in
six==1.16.0
# via python-dateutil
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# aioitertools

View File

@ -17,7 +17,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via simple-salesforce
idna==3.4
# via

View File

@ -15,7 +15,7 @@ charset-normalizer==3.2.0
# via
# -c requirements/base.txt
# requests
cryptography==41.0.3
cryptography==41.0.4
# via
# msal
# pyjwt

View File

@ -111,13 +111,13 @@ types-click==7.1.8
# via -r requirements/test.in
types-markdown==3.4.2.10
# via -r requirements/test.in
types-requests==2.31.0.2
types-requests==2.31.0.3
# via -r requirements/test.in
types-tabulate==0.9.0.3
# via -r requirements/test.in
types-urllib3==1.26.25.14
# via types-requests
typing-extensions==4.7.1
typing-extensions==4.8.0
# via
# -c requirements/base.txt
# black

View File

@ -9,16 +9,6 @@
},
"text": "Data in Brief 22 (2019) 451457"
},
{
"type": "Image",
"element_id": "70d50409ea726a2789ebbd004bec31f4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
"text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
},
{
"type": "NarrativeText",
"element_id": "b0658ce9dccc0acba9a472c2bb992cc9",
@ -39,6 +29,16 @@
},
"text": "Data in Brief"
},
{
"type": "NarrativeText",
"element_id": "9234133787d0a6b3976b16569c0b5cf3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
"text": "journal homepage: www.elsevier.com/locate/dib"
},
{
"type": "Title",
"element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d",
@ -399,16 +399,6 @@
},
"text": "s s o"
},
{
"type": "Title",
"element_id": "1bd621f0b71079e0948b0aad011a7f4b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "t h g e W"
},
{
"type": "Title",
"element_id": "acac86c0e609ca906f632b0e2dacccb2",
@ -419,6 +409,16 @@
},
"text": "l"
},
{
"type": "Title",
"element_id": "1bd621f0b71079e0948b0aad011a7f4b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "t h g e W"
},
{
"type": "Title",
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
@ -439,6 +439,16 @@
},
"text": "(mg)"
},
{
"type": "UncategorizedText",
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "30"
},
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
@ -459,16 +469,6 @@
},
"text": "10"
},
{
"type": "UncategorizedText",
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
"text": "30"
},
{
"type": "Title",
"element_id": "d300d49efc4cd0982dd6bc3377759ae8",
@ -589,6 +589,26 @@
},
"text": "%"
},
{
"type": "NarrativeText",
"element_id": "4f0139b605dfdd9eb93e920a6115e1b5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": ") r a e y / m m"
},
{
"type": "UncategorizedText",
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "("
},
{
"type": "NarrativeText",
"element_id": "49e7364ce1027887460959b2a757b184",
@ -609,16 +629,6 @@
},
"text": "i"
},
{
"type": "NarrativeText",
"element_id": "74599fca46202613cccb12e97774b306",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "E n o i t i b h n I"
},
{
"type": "Title",
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
@ -629,6 +639,26 @@
},
"text": "i"
},
{
"type": "UncategorizedText",
"element_id": "ba5ec51d07a4ac0e951608704431d59a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": ")"
},
{
"type": "NarrativeText",
"element_id": "74599fca46202613cccb12e97774b306",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "E n o i t i b h n I"
},
{
"type": "Title",
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
@ -651,63 +681,13 @@
},
{
"type": "UncategorizedText",
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
"element_id": "525fbe4b6760bd759bfeeae2ee487f12",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "("
},
{
"type": "UncategorizedText",
"element_id": "ba5ec51d07a4ac0e951608704431d59a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": ")"
},
{
"type": "NarrativeText",
"element_id": "4f0139b605dfdd9eb93e920a6115e1b5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": ") r a e y / m m"
},
{
"type": "UncategorizedText",
"element_id": "6c19ccbab57f4e9a47a14c0c50211272",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. SS v- 74 —~X_ Senn, ~~. —__, ~ ol, T T T T T T T 1"
},
{
"type": "UncategorizedText",
"element_id": "8139b33952401b3ee0e2ca84651cb9a1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "0.9"
},
{
"type": "UncategorizedText",
"element_id": "a97b042d7bd59d92a46e8ab17f7dff73",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "1.8"
"text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1"
},
{
"type": "UncategorizedText",
@ -719,6 +699,26 @@
},
"text": "2.7"
},
{
"type": "UncategorizedText",
"element_id": "a97b042d7bd59d92a46e8ab17f7dff73",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "1.8"
},
{
"type": "UncategorizedText",
"element_id": "8139b33952401b3ee0e2ca84651cb9a1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "0.9"
},
{
"type": "UncategorizedText",
"element_id": "ad57366865126e55649ecb23ae1d4888",
@ -729,6 +729,76 @@
},
"text": "100"
},
{
"type": "UncategorizedText",
"element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "10"
},
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "20"
},
{
"type": "UncategorizedText",
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "30"
},
{
"type": "UncategorizedText",
"element_id": "d59eced1ded07f84c145592f65bdf854",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "40"
},
{
"type": "UncategorizedText",
"element_id": "1a6562590ef19d1045d06c4055742d38",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "50"
},
{
"type": "UncategorizedText",
"element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "70"
},
{
"type": "UncategorizedText",
"element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "60"
},
{
"type": "UncategorizedText",
"element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
@ -749,76 +819,6 @@
},
"text": "90"
},
{
"type": "UncategorizedText",
"element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "60"
},
{
"type": "UncategorizedText",
"element_id": "4a44dc15364204a80fe80e9039455cc1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "10"
},
{
"type": "UncategorizedText",
"element_id": "1a6562590ef19d1045d06c4055742d38",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "50"
},
{
"type": "UncategorizedText",
"element_id": "d59eced1ded07f84c145592f65bdf854",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "40"
},
{
"type": "UncategorizedText",
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "30"
},
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "20"
},
{
"type": "UncategorizedText",
"element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "70"
},
{
"type": "UncategorizedText",
"element_id": "5feceb66ffc86f38d952786c6d696c79",
@ -1119,16 +1119,6 @@
},
"text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
},
{
"type": "UncategorizedText",
"element_id": "9492908fadeab22ca81f18f2ba4f4f35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "0 2 4 6 8 10"
},
{
"type": "Title",
"element_id": "362d4a20958df0c88550b9e5d1f2ef5b",
@ -1139,6 +1129,16 @@
},
"text": "Inhibitor concentration (g)"
},
{
"type": "UncategorizedText",
"element_id": "9492908fadeab22ca81f18f2ba4f4f35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "0 2 4 6 8 10"
},
{
"type": "Title",
"element_id": "bcf00b4904f5661d6baef52e7e09e9b1",
@ -1199,16 +1199,6 @@
},
"text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356"
},
{
"type": "UncategorizedText",
"element_id": "d71f426079cb8c2bb3d960ce1e23d290",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
},
{
"type": "Title",
"element_id": "6978574f5e6e70a2883ea5ea51aa34f7",
@ -1219,6 +1209,16 @@
},
"text": "icorr (A/cm2)"
},
{
"type": "UncategorizedText",
"element_id": "d71f426079cb8c2bb3d960ce1e23d290",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
},
{
"type": "Title",
"element_id": "7507a06cf675785949d6312f1776e444",
@ -1301,23 +1301,13 @@
},
{
"type": "UncategorizedText",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"element_id": "2c624232cdd221771294dfbb310aca00",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "2"
},
{
"type": "UncategorizedText",
"element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "4"
"text": "8"
},
{
"type": "UncategorizedText",
@ -1331,23 +1321,33 @@
},
{
"type": "UncategorizedText",
"element_id": "2c624232cdd221771294dfbb310aca00",
"element_id": "4b227777d4dd1fc61c6f884f48641d02",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "8"
"text": "4"
},
{
"type": "Title",
"element_id": "48baae83e27f90cd89699a178ab01c46",
"type": "UncategorizedText",
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "2+ T T T 1"
"text": "2"
},
{
"type": "UncategorizedText",
"element_id": "a0dfa682f99b0794f40f195f9a7adfcd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "—=—Cc/0 2+ T T T 1"
},
{
"type": "UncategorizedText",

View File

@ -9,16 +9,6 @@
},
"text": "Data in Brief 22 (2019) 484487"
},
{
"type": "Image",
"element_id": "70d50409ea726a2789ebbd004bec31f4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
"text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
},
{
"type": "NarrativeText",
"element_id": "b0658ce9dccc0acba9a472c2bb992cc9",
@ -39,6 +29,16 @@
},
"text": "Data in Brief"
},
{
"type": "NarrativeText",
"element_id": "9234133787d0a6b3976b16569c0b5cf3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
"text": "journal homepage: www.elsevier.com/locate/dib"
},
{
"type": "Title",
"element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d",

View File

@ -770,16 +770,6 @@
},
"text": "1 ocr_agent = lp . TesseractAgent () 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent . detect ( image )"
},
{
"type": "Image",
"element_id": "65ac0f9ae348b12ed9484b8af7296617",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7
},
"text": "ocr_agent = lp.TesseractAgent ()pOi"
},
{
"type": "NarrativeText",
"element_id": "7a151dbbe8b26ccdcb264ab005be5a36",

View File

@ -1026,8 +1026,8 @@
"text": "In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi."
},
{
"type": "NarrativeText",
"element_id": "f9bb49945b60897227abdd75b5f8d39b",
"type": "UncategorizedText",
"element_id": "b7a56873cd771f2c446d369b649430b6",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@ -1041,11 +1041,11 @@
"filetype": "application/pdf",
"page_number": 5
},
"text": "r e p s e i t i l"
"text": "25"
},
{
"type": "Title",
"element_id": "f83714d89302473e0e4f5399bd50e7a9",
"type": "UncategorizedText",
"element_id": "6a3adc54db5128f797d4a12855193373",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@ -1059,7 +1059,43 @@
"filetype": "application/pdf",
"page_number": 5
},
"text": "W T"
"text": "24.6"
},
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "20"
},
{
"type": "UncategorizedText",
"element_id": "dfb6b8c404e0fa2b32def4ba49e00b3c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "18.4"
},
{
"type": "NarrativeText",
@ -1079,24 +1115,6 @@
},
"text": "r a e y"
},
{
"type": "Title",
"element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "a t a F"
},
{
"type": "UncategorizedText",
"element_id": "dca468ba69cda6650ce03d976c274c66",
@ -1152,8 +1170,8 @@
"text": "15"
},
{
"type": "UncategorizedText",
"element_id": "b7a56873cd771f2c446d369b649430b6",
"type": "Title",
"element_id": "f83714d89302473e0e4f5399bd50e7a9",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@ -1167,7 +1185,43 @@
"filetype": "application/pdf",
"page_number": 5
},
"text": "25"
"text": "W T"
},
{
"type": "NarrativeText",
"element_id": "f9bb49945b60897227abdd75b5f8d39b",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "r e p s e i t i l"
},
{
"type": "Title",
"element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "a t a F"
},
{
"type": "UncategorizedText",
@ -1187,24 +1241,6 @@
},
"text": "10"
},
{
"type": "UncategorizedText",
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "20"
},
{
"type": "UncategorizedText",
"element_id": "ef2d127de37b942baad06145e54b0c61",
@ -1241,6 +1277,24 @@
},
"text": "0"
},
{
"type": "UncategorizedText",
"element_id": "8bf40d0515e8461bd30866c2eb8ac250",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "4.6"
},
{
"type": "Title",
"element_id": "51229f9593cbcb7c8e25059c004d67b0",
@ -1277,42 +1331,6 @@
},
"text": "C oal"
},
{
"type": "UncategorizedText",
"element_id": "6a3adc54db5128f797d4a12855193373",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "24.6"
},
{
"type": "UncategorizedText",
"element_id": "dfb6b8c404e0fa2b32def4ba49e00b3c",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "18.4"
},
{
"type": "Title",
"element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
@ -1349,24 +1367,6 @@
},
"text": "Bio m ass"
},
{
"type": "UncategorizedText",
"element_id": "8bf40d0515e8461bd30866c2eb8ac250",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 5
},
"text": "4.6"
},
{
"type": "Title",
"element_id": "4fabb98454d019811a732c4a09f31bf0",
@ -1836,8 +1836,8 @@
"text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs be they economic, environmental, or public health associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power."
},
{
"type": "Image",
"element_id": "c9889d326ca46635644c051ced3cdde5",
"type": "Title",
"element_id": "7ec686735b6e51f8276b057051369b15",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@ -1851,25 +1851,7 @@
"filetype": "application/pdf",
"page_number": 7
},
"text": "Plant-levelproduction costsat market prices Grid-level costsof the electricitysystem ber Jest—"
},
{
"type": "Image",
"element_id": "2550e9a8245a64cdb4de02c91133865a",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 7
},
"text": "Plant-levelproduction costsat market prices"
"text": "ae) flea"
},
{
"type": "Title",
@ -2628,8 +2610,8 @@
"text": "8"
},
{
"type": "Image",
"element_id": "6d647fc38c561c01f7859e019345d367",
"type": "UncategorizedText",
"element_id": "481e5a54650b0a4ac7bc2568ddad436d",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
@ -2643,7 +2625,25 @@
"filetype": "application/pdf",
"page_number": 12
},
"text": "+44 (0)20 7451 1520www.world-nuclear.orginfo@world-nuclear.org World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate."
"text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom"
},
{
"type": "NarrativeText",
"element_id": "36d3613fc20527bb317afd4e447d1c74",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 12
},
"text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741"
},
{
"type": "Title",
@ -2663,24 +2663,6 @@
},
"text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org"
},
{
"type": "UncategorizedText",
"element_id": "481e5a54650b0a4ac7bc2568ddad436d",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 12
},
"text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom"
},
{
"type": "NarrativeText",
"element_id": "20ef77d9aa66e60f1443750cdbaa9014",
@ -2698,23 +2680,5 @@
"page_number": 12
},
"text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate."
},
{
"type": "NarrativeText",
"element_id": "36d3613fc20527bb317afd4e447d1c74",
"metadata": {
"data_source": {
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
"version": 306475068461766865312866697521104206816,
"record_locator": {
"protocol": "s3",
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
},
"date_modified": "2023-02-12T10:09:32"
},
"filetype": "application/pdf",
"page_number": 12
},
"text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741"
}
]

View File

@ -142,6 +142,7 @@ class ElementMetadata:
attached_to_filename: Optional[str] = None
parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
category_depth: Optional[int] = None
image_path: Optional[str] = None
# Page numbers currently supported for PDF, HTML and PPT documents
page_number: Optional[int] = None

View File

@ -269,10 +269,12 @@ def _add_element_metadata(
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
coordinate_system: Optional[CoordinateSystem] = None,
section: Optional[str] = None,
image_path: Optional[str] = None,
**kwargs,
) -> Element:
"""Adds document metadata to the document element. Document metadata includes information
like the filename, source url, and page number."""
coordinates_metadata = (
CoordinatesMetadata(
points=coordinates,
@ -314,6 +316,7 @@ def _add_element_metadata(
emphasized_text_tags=emphasized_text_tags,
section=section,
category_depth=depth,
image_path=image_path,
)
# NOTE(newel) - Element metadata is being merged into
# newly constructed metadata, not the other way around
@ -570,6 +573,11 @@ def document_to_element_list(
coordinates = (
element.metadata.coordinates.points if element.metadata.coordinates else None
)
el_image_path = (
layout_element.image_path if hasattr(layout_element, "image_path") else None
)
_add_element_metadata(
element,
page_number=i + 1,
@ -577,6 +585,7 @@ def document_to_element_list(
coordinates=coordinates,
coordinate_system=coordinate_system,
category_depth=element.metadata.category_depth,
image_path=el_image_path,
**kwargs,
)

View File

@ -310,29 +310,37 @@ def _partition_pdf_or_image_local(
ocr_languages = prepare_languages_for_tesseract(languages)
model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
image_output_dir_path = kwargs.get("image_output_dir_path", None)
process_with_model_kwargs = {
"is_image": is_image,
"ocr_languages": ocr_languages,
"ocr_mode": ocr_mode,
"extract_tables": infer_table_structure,
"model_name": model_name,
}
process_with_model_extra_kwargs = {
"pdf_image_dpi": pdf_image_dpi,
"extract_images_in_pdf": extract_images_in_pdf,
"image_output_dir_path": image_output_dir_path,
}
for key, value in process_with_model_extra_kwargs.items():
if value:
process_with_model_kwargs[key] = value
if file is None:
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
process_file_with_model_kwargs = {
"is_image": is_image,
"ocr_languages": ocr_languages,
"ocr_mode": ocr_mode,
"extract_tables": infer_table_structure,
"model_name": model_name,
}
if pdf_image_dpi:
process_file_with_model_kwargs["pdf_image_dpi"] = pdf_image_dpi
layout = process_file_with_model(
filename,
**process_file_with_model_kwargs,
**process_with_model_kwargs,
)
else:
layout = process_data_with_model(
file,
is_image=is_image,
ocr_languages=ocr_languages,
ocr_mode=ocr_mode,
extract_tables=infer_table_structure,
model_name=model_name,
**process_with_model_kwargs,
)
elements = document_to_element_list(
layout,
@ -345,17 +353,22 @@ def _partition_pdf_or_image_local(
infer_list_items=False,
**kwargs,
)
out_elements = []
out_elements = []
for el in elements:
if (isinstance(el, PageBreak) and not include_page_breaks) or (
# NOTE(crag): small chunks of text from Image elements tend to be garbage
isinstance(el, Image)
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
):
if isinstance(el, PageBreak) and not include_page_breaks:
continue
if isinstance(el, Image):
# NOTE(crag): small chunks of text from Image elements tend to be garbage
if not el.metadata.image_path and (
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
):
continue
else:
out_elements.append(cast(Element, el))
# NOTE(crag): this is probably always a Text object, but check for the sake of typing
if isinstance(el, Text):
elif isinstance(el, Text):
el.text = re.sub(
RE_MULTISPACE_INCLUDING_NEWLINES,
" ",