mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-25 06:04:53 +00:00
Feat/1332 save embedded images in pdf (#1371)
Addresses [#1332](https://github.com/Unstructured-IO/unstructured/issues/1332) with `unstructured-inference` PR [#208](https://github.com/Unstructured-IO/unstructured-inference/pull/208).

### Summary
- Add `image_path` to element metadata
- Pass parameters related to extracting images in PDF
- Preserve image elements ignored due to garbage text if `el.metadata.image_path` is `True`

### Testing
```python
from unstructured.partition.pdf import partition_pdf

f_path = "example-docs/embedded-images.pdf"

# default image output directory
elements = partition_pdf(
    f_path,
    strategy=strategy,
    extract_images_in_pdf=True,
)

# specific image output directory
elements = partition_pdf(
    f_path,
    strategy=strategy,
    extract_images_in_pdf=True,
    image_output_dir_path=<directory path>,
)
```
This commit is contained in:
parent
92ad7698fb
commit
2d951722df
@ -3,6 +3,7 @@
|
||||
### Enhancements
|
||||
|
||||
* **Adds data source properties to SharePoint, Outlook, Onedrive, Reddit, and Slack connectors** These properties (date_created, date_modified, version, source_url, record_locator) are written to element metadata during ingest, mapping elements to information about the document source from which they derive. This functionality enables downstream applications to reveal source document applications, e.g. a link to a GDrive doc, Salesforce record, etc.
|
||||
* **Add functionality to save embedded images in PDFs separately as images** This allows users to save embedded images in PDFs separately as images, given some directory path. The saved image path is written to the metadata for the Image element. Downstream applications may benefit by providing users with image links from relevant "hits."
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -105,5 +105,5 @@ urllib3==1.26.16
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via importlib-metadata
|
||||
|
||||
BIN
example-docs/embedded-images.pdf
Normal file
BIN
example-docs/embedded-images.pdf
Normal file
Binary file not shown.
@ -50,7 +50,7 @@ tabulate==0.9.0
|
||||
# via -r requirements/base.in
|
||||
tqdm==4.66.1
|
||||
# via nltk
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via typing-inspect
|
||||
typing-inspect==0.9.0
|
||||
# via dataclasses-json
|
||||
|
||||
@ -105,5 +105,5 @@ urllib3==1.26.16
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via importlib-metadata
|
||||
|
||||
@ -23,7 +23,7 @@ IPython<8.13
|
||||
# AttributeError: 'ResourcePath' object has no attribute 'collection'
|
||||
Office365-REST-Python-Client<2.4.3
|
||||
# NOTE(christine) Pinned to set the `unstructured-inference` version
|
||||
unstructured-inference==0.5.28
|
||||
unstructured-inference==0.5.31
|
||||
# NOTE(klaijan) - Moved pin from test.in
|
||||
# pinning to avoid error in argilla library
|
||||
pydantic<2
|
||||
@ -34,3 +34,5 @@ unstructured.pytesseract>=0.3.12
|
||||
weaviate-client==3.23.2
|
||||
# Note(yuming) - pinning to avoid conflict with paddle install
|
||||
matplotlib==3.7.2
|
||||
# NOTE(crag) - pin to available pandas for python 3.8 (at least in CI)
|
||||
pandas<2.1.1
|
||||
|
||||
@ -69,9 +69,7 @@ defusedxml==0.7.1
|
||||
distlib==0.3.7
|
||||
# via virtualenv
|
||||
exceptiongroup==1.1.3
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# anyio
|
||||
# via anyio
|
||||
executing==1.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.18.0
|
||||
@ -97,7 +95,7 @@ importlib-metadata==6.8.0
|
||||
# jupyterlab
|
||||
# jupyterlab-server
|
||||
# nbconvert
|
||||
importlib-resources==6.0.1
|
||||
importlib-resources==6.1.0
|
||||
# via
|
||||
# jsonschema
|
||||
# jsonschema-specifications
|
||||
@ -133,7 +131,7 @@ json5==0.9.14
|
||||
# via jupyterlab-server
|
||||
jsonpointer==2.4
|
||||
# via jsonschema
|
||||
jsonschema[format-nongpl]==4.19.0
|
||||
jsonschema[format-nongpl]==4.19.1
|
||||
# via
|
||||
# jupyter-events
|
||||
# jupyterlab-server
|
||||
@ -211,7 +209,7 @@ nest-asyncio==1.5.8
|
||||
# via ipykernel
|
||||
nodeenv==1.8.0
|
||||
# via pre-commit
|
||||
notebook==7.0.3
|
||||
notebook==7.0.4
|
||||
# via jupyter
|
||||
notebook-shim==0.2.3
|
||||
# via
|
||||
@ -346,7 +344,6 @@ tinycss2==1.2.1
|
||||
# via nbconvert
|
||||
tomli==2.0.1
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# build
|
||||
# jupyterlab
|
||||
# pip-tools
|
||||
@ -376,7 +373,7 @@ traitlets==5.10.0
|
||||
# nbconvert
|
||||
# nbformat
|
||||
# qtconsole
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/test.txt
|
||||
@ -408,7 +405,7 @@ wheel==0.41.2
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.9
|
||||
# via ipywidgets
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
@ -9,7 +9,9 @@ numpy==1.24.4
|
||||
# -c requirements/constraints.in
|
||||
# pandas
|
||||
pandas==2.0.3
|
||||
# via -r requirements/extra-csv.in
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/extra-csv.in
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
pytz==2023.3.post1
|
||||
|
||||
@ -8,5 +8,5 @@ importlib-metadata==6.8.0
|
||||
# via markdown
|
||||
markdown==3.4.4
|
||||
# via -r requirements/extra-markdown.in
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via importlib-metadata
|
||||
|
||||
@ -61,7 +61,7 @@ imgaug==0.4.0
|
||||
# via unstructured-paddleocr
|
||||
importlib-metadata==6.8.0
|
||||
# via flask
|
||||
importlib-resources==6.0.1
|
||||
importlib-resources==6.1.0
|
||||
# via matplotlib
|
||||
itsdangerous==2.1.2
|
||||
# via flask
|
||||
@ -125,7 +125,9 @@ packaging==23.1
|
||||
# scikit-image
|
||||
# visualdl
|
||||
pandas==2.0.3
|
||||
# via visualdl
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# visualdl
|
||||
pdf2image==1.16.3
|
||||
# via unstructured-paddleocr
|
||||
pillow==10.0.1
|
||||
@ -167,7 +169,7 @@ pywavelets==1.4.1
|
||||
# via scikit-image
|
||||
rapidfuzz==3.3.0
|
||||
# via unstructured-paddleocr
|
||||
rarfile==4.0
|
||||
rarfile==4.1
|
||||
# via visualdl
|
||||
requests==2.31.0
|
||||
# via
|
||||
@ -213,7 +215,7 @@ visualdl==2.5.3
|
||||
# via unstructured-paddleocr
|
||||
werkzeug==2.3.7
|
||||
# via flask
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via
|
||||
# importlib-metadata
|
||||
# importlib-resources
|
||||
|
||||
@ -22,7 +22,7 @@ coloredlogs==15.0.1
|
||||
# via onnxruntime
|
||||
contourpy==1.1.1
|
||||
# via matplotlib
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via pdfminer-six
|
||||
cycler==0.11.0
|
||||
# via matplotlib
|
||||
@ -39,7 +39,7 @@ fonttools==4.42.1
|
||||
# via matplotlib
|
||||
fsspec==2023.9.1
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.17.1
|
||||
huggingface-hub==0.17.2
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
@ -50,7 +50,7 @@ idna==3.4
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
importlib-resources==6.0.1
|
||||
importlib-resources==6.1.0
|
||||
# via matplotlib
|
||||
iopath==0.1.10
|
||||
# via layoutparser
|
||||
@ -88,7 +88,7 @@ omegaconf==2.3.0
|
||||
# via effdet
|
||||
onnx==1.14.1
|
||||
# via unstructured-inference
|
||||
onnxruntime==1.15.1
|
||||
onnxruntime==1.16.0
|
||||
# via unstructured-inference
|
||||
opencv-python==4.8.0.76
|
||||
# via
|
||||
@ -104,7 +104,9 @@ packaging==23.1
|
||||
# transformers
|
||||
# unstructured-pytesseract
|
||||
pandas==2.0.3
|
||||
# via layoutparser
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# layoutparser
|
||||
pdf2image==1.16.3
|
||||
# via
|
||||
# -r requirements/extra-pdf-image.in
|
||||
@ -158,6 +160,8 @@ pyyaml==6.0.1
|
||||
# omegaconf
|
||||
# timm
|
||||
# transformers
|
||||
rapidfuzz==3.3.0
|
||||
# via unstructured-inference
|
||||
regex==2023.8.8
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
@ -206,7 +210,7 @@ tqdm==4.66.1
|
||||
# transformers
|
||||
transformers==4.33.2
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
@ -215,7 +219,7 @@ typing-extensions==4.7.1
|
||||
# torch
|
||||
tzdata==2023.3
|
||||
# via pandas
|
||||
unstructured-inference==0.5.28
|
||||
unstructured-inference==0.5.31
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/extra-pdf-image.in
|
||||
@ -228,5 +232,5 @@ urllib3==1.26.16
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
zipp==3.16.2
|
||||
zipp==3.17.0
|
||||
# via importlib-resources
|
||||
|
||||
@ -10,5 +10,5 @@ pillow==10.0.1
|
||||
# via python-pptx
|
||||
python-pptx==0.6.21
|
||||
# via -r requirements/extra-pptx.in
|
||||
xlsxwriter==3.1.3
|
||||
xlsxwriter==3.1.4
|
||||
# via python-pptx
|
||||
|
||||
@ -13,7 +13,9 @@ numpy==1.24.4
|
||||
openpyxl==3.1.2
|
||||
# via -r requirements/extra-xlsx.in
|
||||
pandas==2.0.3
|
||||
# via -r requirements/extra-xlsx.in
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
# -r requirements/extra-xlsx.in
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
pytz==2023.3.post1
|
||||
|
||||
@ -24,7 +24,7 @@ filelock==3.12.4
|
||||
# transformers
|
||||
fsspec==2023.9.1
|
||||
# via huggingface-hub
|
||||
huggingface-hub==0.17.1
|
||||
huggingface-hub==0.17.2
|
||||
# via transformers
|
||||
idna==3.4
|
||||
# via
|
||||
@ -93,7 +93,7 @@ tqdm==4.66.1
|
||||
# transformers
|
||||
transformers==4.33.2
|
||||
# via -r requirements/huggingface.in
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
|
||||
@ -29,7 +29,7 @@ requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pyairtable
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# pyairtable
|
||||
|
||||
@ -39,7 +39,7 @@ charset-normalizer==3.2.0
|
||||
# -c requirements/base.txt
|
||||
# aiohttp
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via
|
||||
# azure-identity
|
||||
# azure-storage-blob
|
||||
@ -87,7 +87,7 @@ six==1.16.0
|
||||
# via
|
||||
# azure-core
|
||||
# isodate
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# azure-core
|
||||
|
||||
@ -21,7 +21,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via boxsdk
|
||||
fsspec==2023.9.1
|
||||
# via
|
||||
|
||||
@ -57,7 +57,7 @@ google-auth-oauthlib==1.1.0
|
||||
# via gcsfs
|
||||
google-cloud-core==2.3.3
|
||||
# via google-cloud-storage
|
||||
google-cloud-storage==2.10.0
|
||||
google-cloud-storage==2.11.0
|
||||
# via gcsfs
|
||||
google-crc32c==1.5.0
|
||||
# via google-resumable-media
|
||||
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via pyjwt
|
||||
deprecated==1.2.14
|
||||
# via pygithub
|
||||
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.2.0
|
||||
# requests
|
||||
google-api-core==2.11.1
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.99.0
|
||||
google-api-python-client==2.100.0
|
||||
# via -r requirements/ingest-google-drive.in
|
||||
google-auth==2.23.0
|
||||
# via
|
||||
|
||||
@ -21,7 +21,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via
|
||||
# msal
|
||||
# pyjwt
|
||||
|
||||
@ -15,7 +15,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via
|
||||
# msal
|
||||
# pyjwt
|
||||
|
||||
@ -48,7 +48,7 @@ s3fs==2023.9.1
|
||||
# via -r requirements/ingest-s3.in
|
||||
six==1.16.0
|
||||
# via python-dateutil
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# aioitertools
|
||||
|
||||
@ -17,7 +17,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via simple-salesforce
|
||||
idna==3.4
|
||||
# via
|
||||
|
||||
@ -15,7 +15,7 @@ charset-normalizer==3.2.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# requests
|
||||
cryptography==41.0.3
|
||||
cryptography==41.0.4
|
||||
# via
|
||||
# msal
|
||||
# pyjwt
|
||||
|
||||
@ -111,13 +111,13 @@ types-click==7.1.8
|
||||
# via -r requirements/test.in
|
||||
types-markdown==3.4.2.10
|
||||
# via -r requirements/test.in
|
||||
types-requests==2.31.0.2
|
||||
types-requests==2.31.0.3
|
||||
# via -r requirements/test.in
|
||||
types-tabulate==0.9.0.3
|
||||
# via -r requirements/test.in
|
||||
types-urllib3==1.26.25.14
|
||||
# via types-requests
|
||||
typing-extensions==4.7.1
|
||||
typing-extensions==4.8.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
|
||||
@ -9,16 +9,6 @@
|
||||
},
|
||||
"text": "Data in Brief 22 (2019) 451–457"
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "70d50409ea726a2789ebbd004bec31f4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b0658ce9dccc0acba9a472c2bb992cc9",
|
||||
@ -39,6 +29,16 @@
|
||||
},
|
||||
"text": "Data in Brief"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9234133787d0a6b3976b16569c0b5cf3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "journal homepage: www.elsevier.com/locate/dib"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d",
|
||||
@ -399,16 +399,6 @@
|
||||
},
|
||||
"text": "s s o"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1bd621f0b71079e0948b0aad011a7f4b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "t h g e W"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "acac86c0e609ca906f632b0e2dacccb2",
|
||||
@ -419,6 +409,16 @@
|
||||
},
|
||||
"text": "l"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1bd621f0b71079e0948b0aad011a7f4b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "t h g e W"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
|
||||
@ -439,6 +439,16 @@
|
||||
},
|
||||
"text": "(mg)"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "30"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
|
||||
@ -459,16 +469,6 @@
|
||||
},
|
||||
"text": "10"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "30"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "d300d49efc4cd0982dd6bc3377759ae8",
|
||||
@ -589,6 +589,26 @@
|
||||
},
|
||||
"text": "%"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4f0139b605dfdd9eb93e920a6115e1b5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": ") r a e y / m m"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "("
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "49e7364ce1027887460959b2a757b184",
|
||||
@ -609,16 +629,6 @@
|
||||
},
|
||||
"text": "i"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "74599fca46202613cccb12e97774b306",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "E n o i t i b h n I"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
|
||||
@ -629,6 +639,26 @@
|
||||
},
|
||||
"text": "i"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ba5ec51d07a4ac0e951608704431d59a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": ")"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "74599fca46202613cccb12e97774b306",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "E n o i t i b h n I"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "de7d1b721a1e0632b7cf04edf5032c8e",
|
||||
@ -651,63 +681,13 @@
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
|
||||
"element_id": "525fbe4b6760bd759bfeeae2ee487f12",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "("
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ba5ec51d07a4ac0e951608704431d59a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": ")"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "4f0139b605dfdd9eb93e920a6115e1b5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": ") r a e y / m m"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "6c19ccbab57f4e9a47a14c0c50211272",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. SS v- 74 —~X_ Senn, ~~. —__, ~ ol, T T T T T T T 1"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8139b33952401b3ee0e2ca84651cb9a1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "0.9"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "a97b042d7bd59d92a46e8ab17f7dff73",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "1.8"
|
||||
"text": "(mm/year) 100 4 80 4 Efficiency (%) 1 _—__. —o— SS v- —a— 74 —~X_ Senn, —y— ~~. —6~ —__, ~ —o- ol, T T T T T T T 1"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
@ -719,6 +699,26 @@
|
||||
},
|
||||
"text": "2.7"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "a97b042d7bd59d92a46e8ab17f7dff73",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "1.8"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8139b33952401b3ee0e2ca84651cb9a1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "0.9"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ad57366865126e55649ecb23ae1d4888",
|
||||
@ -729,6 +729,76 @@
|
||||
},
|
||||
"text": "100"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4a44dc15364204a80fe80e9039455cc1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "10"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "20"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "30"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d59eced1ded07f84c145592f65bdf854",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "40"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "1a6562590ef19d1045d06c4055742d38",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "50"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "70"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "60"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "48449a14a4ff7d79bb7a1b6f3d488eba",
|
||||
@ -749,76 +819,6 @@
|
||||
},
|
||||
"text": "90"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "39fa9ec190eee7b6f4dff1100d6343e1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "60"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4a44dc15364204a80fe80e9039455cc1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "10"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "1a6562590ef19d1045d06c4055742d38",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "50"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d59eced1ded07f84c145592f65bdf854",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "40"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "624b60c58c9d8bfb6ff1886c2fd605d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "30"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "20"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ff5a1ae012afa5d4c889c50ad427aaf5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 3
|
||||
},
|
||||
"text": "70"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "5feceb66ffc86f38d952786c6d696c79",
|
||||
@ -1119,16 +1119,6 @@
|
||||
},
|
||||
"text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "9492908fadeab22ca81f18f2ba4f4f35",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "0 2 4 6 8 10"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "362d4a20958df0c88550b9e5d1f2ef5b",
|
||||
@ -1139,6 +1129,16 @@
|
||||
},
|
||||
"text": "Inhibitor concentration (g)"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "9492908fadeab22ca81f18f2ba4f4f35",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "0 2 4 6 8 10"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "bcf00b4904f5661d6baef52e7e09e9b1",
|
||||
@ -1199,16 +1199,6 @@
|
||||
},
|
||||
"text": "(cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d71f426079cb8c2bb3d960ce1e23d290",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "6978574f5e6e70a2883ea5ea51aa34f7",
|
||||
@ -1219,6 +1209,16 @@
|
||||
},
|
||||
"text": "icorr (A/cm2)"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d71f426079cb8c2bb3d960ce1e23d290",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "7507a06cf675785949d6312f1776e444",
|
||||
@ -1301,23 +1301,13 @@
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
|
||||
"element_id": "2c624232cdd221771294dfbb310aca00",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "2"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4b227777d4dd1fc61c6f884f48641d02",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "4"
|
||||
"text": "8"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
@ -1331,23 +1321,33 @@
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "2c624232cdd221771294dfbb310aca00",
|
||||
"element_id": "4b227777d4dd1fc61c6f884f48641d02",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "8"
|
||||
"text": "4"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "48baae83e27f90cd89699a178ab01c46",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "2+ T T T 1"
|
||||
"text": "2"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "a0dfa682f99b0794f40f195f9a7adfcd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 4
|
||||
},
|
||||
"text": "—=—Cc/0 2+ T T T 1"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
|
||||
@ -9,16 +9,6 @@
|
||||
},
|
||||
"text": "Data in Brief 22 (2019) 484–487"
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "70d50409ea726a2789ebbd004bec31f4",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Contents lists available at ScienceDirect Data in Brief journal homepage: www.elsevier.com/locate/dib"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "b0658ce9dccc0acba9a472c2bb992cc9",
|
||||
@ -39,6 +29,16 @@
|
||||
},
|
||||
"text": "Data in Brief"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9234133787d0a6b3976b16569c0b5cf3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "journal homepage: www.elsevier.com/locate/dib"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "0ccb3a9876bbc64a1ca09fa40c4f844d",
|
||||
|
||||
@ -770,16 +770,6 @@
|
||||
},
|
||||
"text": "1 ocr_agent = lp . TesseractAgent () 2 # Can be easily switched to other OCR software 3 tokens = ocr_agent . detect ( image )"
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "65ac0f9ae348b12ed9484b8af7296617",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "ocr_agent = lp.TesseractAgent ()pOi"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7a151dbbe8b26ccdcb264ab005be5a36",
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
@ -1026,8 +1026,8 @@
|
||||
"text": "In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f9bb49945b60897227abdd75b5f8d39b",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "b7a56873cd771f2c446d369b649430b6",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
@ -1041,11 +1041,11 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "r e p s e i t i l"
|
||||
"text": "25"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f83714d89302473e0e4f5399bd50e7a9",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "6a3adc54db5128f797d4a12855193373",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
@ -1059,7 +1059,43 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "W T"
|
||||
"text": "24.6"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "20"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "dfb6b8c404e0fa2b32def4ba49e00b3c",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "18.4"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -1079,24 +1115,6 @@
|
||||
},
|
||||
"text": "r a e y"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "a t a F"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "dca468ba69cda6650ce03d976c274c66",
|
||||
@ -1152,8 +1170,8 @@
|
||||
"text": "15"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "b7a56873cd771f2c446d369b649430b6",
|
||||
"type": "Title",
|
||||
"element_id": "f83714d89302473e0e4f5399bd50e7a9",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
@ -1167,7 +1185,43 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "25"
|
||||
"text": "W T"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f9bb49945b60897227abdd75b5f8d39b",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "r e p s e i t i l"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "1fb2ec4fc8fc547c0de86ba79ba651e5",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "a t a F"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
@ -1187,24 +1241,6 @@
|
||||
},
|
||||
"text": "10"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5ca38f748a1d6eaf726b8a42fb575c3",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "20"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "ef2d127de37b942baad06145e54b0c61",
|
||||
@ -1241,6 +1277,24 @@
|
||||
},
|
||||
"text": "0"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8bf40d0515e8461bd30866c2eb8ac250",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "4.6"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "51229f9593cbcb7c8e25059c004d67b0",
|
||||
@ -1277,42 +1331,6 @@
|
||||
},
|
||||
"text": "C oal"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "6a3adc54db5128f797d4a12855193373",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "24.6"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "dfb6b8c404e0fa2b32def4ba49e00b3c",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "18.4"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "2378bdd2cf4f491cf401e6b215cbb4fd",
|
||||
@ -1349,24 +1367,6 @@
|
||||
},
|
||||
"text": "Bio m ass"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8bf40d0515e8461bd30866c2eb8ac250",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 5
|
||||
},
|
||||
"text": "4.6"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "4fabb98454d019811a732c4a09f31bf0",
|
||||
@ -1836,8 +1836,8 @@
|
||||
"text": "If the potential of nuclear energy is to be fully realized, public health and safety approaches must be recalibrated to consider a wider range of factors when considering radiation, adopting an “all-hazards” approach. Such an approach must ensure that risks are placed within a proper perspective and context, rather than looking at them in isolation. We therefore must not look at the costs – be they economic, environmental, or public health – associated with an individual power plant in isolation, but rather the costs associated with it (and its alternatives) at a societal level (Figure 4). This would entail looking at the potential risks arising from the use of nuclear power and comparing these with the risks associated with not adopting nuclear power."
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "c9889d326ca46635644c051ced3cdde5",
|
||||
"type": "Title",
|
||||
"element_id": "7ec686735b6e51f8276b057051369b15",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
@ -1851,25 +1851,7 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "Plant-levelproduction costsat market prices Grid-level costsof the electricitysystem ber Jest—"
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "2550e9a8245a64cdb4de02c91133865a",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 7
|
||||
},
|
||||
"text": "Plant-levelproduction costsat market prices"
|
||||
"text": "ae) flea"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -2628,8 +2610,8 @@
|
||||
"text": "8"
|
||||
},
|
||||
{
|
||||
"type": "Image",
|
||||
"element_id": "6d647fc38c561c01f7859e019345d367",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "481e5a54650b0a4ac7bc2568ddad436d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
@ -2643,7 +2625,25 @@
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "+44 (0)20 7451 1520www.world-nuclear.orginfo@world-nuclear.org World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate."
|
||||
"text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "36d3613fc20527bb317afd4e447d1c74",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -2663,24 +2663,6 @@
|
||||
},
|
||||
"text": "+44 (0)20 7451 1520 www.world-nuclear.org info@world-nuclear.org"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "481e5a54650b0a4ac7bc2568ddad436d",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "World Nuclear Association Tower House 10 Southampton Street London WC2E 7HA United Kingdom"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "20ef77d9aa66e60f1443750cdbaa9014",
|
||||
@ -2698,23 +2680,5 @@
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "World Nuclear Association is the international organization that represents the global nuclear industry. Its mission is to promote a wider understanding of nuclear energy among key international influencers by producing authoritative information, developing common industry positions, and contributing to the energy debate."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "36d3613fc20527bb317afd4e447d1c74",
|
||||
"metadata": {
|
||||
"data_source": {
|
||||
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
|
||||
"version": 306475068461766865312866697521104206816,
|
||||
"record_locator": {
|
||||
"protocol": "s3",
|
||||
"remote_file_path": "utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf"
|
||||
},
|
||||
"date_modified": "2023-02-12T10:09:32"
|
||||
},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 12
|
||||
},
|
||||
"text": "Recalibrating risk © 2021 World Nuclear Association. Registered in England and Wales, company number 01215741"
|
||||
}
|
||||
]
|
||||
@ -142,6 +142,7 @@ class ElementMetadata:
|
||||
attached_to_filename: Optional[str] = None
|
||||
parent_id: Optional[Union[str, uuid.UUID, NoID, UUID]] = None
|
||||
category_depth: Optional[int] = None
|
||||
image_path: Optional[str] = None
|
||||
|
||||
# Page numbers currently supported for PDF, HTML and PPT documents
|
||||
page_number: Optional[int] = None
|
||||
|
||||
@ -269,10 +269,12 @@ def _add_element_metadata(
|
||||
coordinates: Optional[Tuple[Tuple[float, float], ...]] = None,
|
||||
coordinate_system: Optional[CoordinateSystem] = None,
|
||||
section: Optional[str] = None,
|
||||
image_path: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> Element:
|
||||
"""Adds document metadata to the document element. Document metadata includes information
|
||||
like the filename, source url, and page number."""
|
||||
|
||||
coordinates_metadata = (
|
||||
CoordinatesMetadata(
|
||||
points=coordinates,
|
||||
@ -314,6 +316,7 @@ def _add_element_metadata(
|
||||
emphasized_text_tags=emphasized_text_tags,
|
||||
section=section,
|
||||
category_depth=depth,
|
||||
image_path=image_path,
|
||||
)
|
||||
# NOTE(newel) - Element metadata is being merged into
|
||||
# newly constructed metadata, not the other way around
|
||||
@ -570,6 +573,11 @@ def document_to_element_list(
|
||||
coordinates = (
|
||||
element.metadata.coordinates.points if element.metadata.coordinates else None
|
||||
)
|
||||
|
||||
el_image_path = (
|
||||
layout_element.image_path if hasattr(layout_element, "image_path") else None
|
||||
)
|
||||
|
||||
_add_element_metadata(
|
||||
element,
|
||||
page_number=i + 1,
|
||||
@ -577,6 +585,7 @@ def document_to_element_list(
|
||||
coordinates=coordinates,
|
||||
coordinate_system=coordinate_system,
|
||||
category_depth=element.metadata.category_depth,
|
||||
image_path=el_image_path,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -310,29 +310,37 @@ def _partition_pdf_or_image_local(
|
||||
ocr_languages = prepare_languages_for_tesseract(languages)
|
||||
|
||||
model_name = model_name if model_name else os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME")
|
||||
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
|
||||
extract_images_in_pdf = kwargs.get("extract_images_in_pdf", False)
|
||||
image_output_dir_path = kwargs.get("image_output_dir_path", None)
|
||||
|
||||
process_with_model_kwargs = {
|
||||
"is_image": is_image,
|
||||
"ocr_languages": ocr_languages,
|
||||
"ocr_mode": ocr_mode,
|
||||
"extract_tables": infer_table_structure,
|
||||
"model_name": model_name,
|
||||
}
|
||||
|
||||
process_with_model_extra_kwargs = {
|
||||
"pdf_image_dpi": pdf_image_dpi,
|
||||
"extract_images_in_pdf": extract_images_in_pdf,
|
||||
"image_output_dir_path": image_output_dir_path,
|
||||
}
|
||||
|
||||
for key, value in process_with_model_extra_kwargs.items():
|
||||
if value:
|
||||
process_with_model_kwargs[key] = value
|
||||
|
||||
if file is None:
|
||||
pdf_image_dpi = kwargs.pop("pdf_image_dpi", None)
|
||||
process_file_with_model_kwargs = {
|
||||
"is_image": is_image,
|
||||
"ocr_languages": ocr_languages,
|
||||
"ocr_mode": ocr_mode,
|
||||
"extract_tables": infer_table_structure,
|
||||
"model_name": model_name,
|
||||
}
|
||||
if pdf_image_dpi:
|
||||
process_file_with_model_kwargs["pdf_image_dpi"] = pdf_image_dpi
|
||||
layout = process_file_with_model(
|
||||
filename,
|
||||
**process_file_with_model_kwargs,
|
||||
**process_with_model_kwargs,
|
||||
)
|
||||
else:
|
||||
layout = process_data_with_model(
|
||||
file,
|
||||
is_image=is_image,
|
||||
ocr_languages=ocr_languages,
|
||||
ocr_mode=ocr_mode,
|
||||
extract_tables=infer_table_structure,
|
||||
model_name=model_name,
|
||||
**process_with_model_kwargs,
|
||||
)
|
||||
elements = document_to_element_list(
|
||||
layout,
|
||||
@ -345,17 +353,22 @@ def _partition_pdf_or_image_local(
|
||||
infer_list_items=False,
|
||||
**kwargs,
|
||||
)
|
||||
out_elements = []
|
||||
|
||||
out_elements = []
|
||||
for el in elements:
|
||||
if (isinstance(el, PageBreak) and not include_page_breaks) or (
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
isinstance(el, Image)
|
||||
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
|
||||
):
|
||||
if isinstance(el, PageBreak) and not include_page_breaks:
|
||||
continue
|
||||
|
||||
if isinstance(el, Image):
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
if not el.metadata.image_path and (
|
||||
el.text is None or len(el.text) < 24 or el.text.find(" ") == -1
|
||||
):
|
||||
continue
|
||||
else:
|
||||
out_elements.append(cast(Element, el))
|
||||
# NOTE(crag): this is probably always a Text object, but check for the sake of typing
|
||||
if isinstance(el, Text):
|
||||
elif isinstance(el, Text):
|
||||
el.text = re.sub(
|
||||
RE_MULTISPACE_INCLUDING_NEWLINES,
|
||||
" ",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user