From dfa17bd3a0c476dce571b8b493dd2ff80ddaebc1 Mon Sep 17 00:00:00 2001 From: cragwolfe Date: Fri, 4 Apr 2025 14:38:23 -0700 Subject: [PATCH] fix: hi_res PDF parsing: only uncategorized text for extracted elements (#3975) --- CHANGELOG.md | 3 +- .../partition/pdf_image/test_pdf.py | 4 +- test_unstructured/partition/test_msg.py | 2 +- .../biomed-api/65/11/main.PMC6312790.pdf.html | 30 +++---- .../biomed-api/75/29/main.PMC6312793.pdf.html | 28 +++--- .../07/07/sbaa031.073.PMC7234218.pdf.html | 4 +- .../recalibrating-risk-report.pdf.html | 86 +++++++++---------- .../layout-parser-paper-with-table.jpg.html | 4 +- .../layout-parser-paper.pdf.html | 54 ++++++------ .../biomed-api/65/11/main.PMC6312790.pdf.json | 20 ++--- .../biomed-api/75/29/main.PMC6312793.pdf.json | 18 ++-- .../07/07/sbaa031.073.PMC7234218.pdf.json | 2 +- .../recalibrating-risk-report.pdf.json | 44 +++++----- .../layout-parser-paper-with-table.jpg.json | 2 +- .../layout-parser-paper.pdf.json | 30 +++---- unstructured/__version__.py | 2 +- unstructured/partition/pdf.py | 5 +- 17 files changed, 171 insertions(+), 167 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ad3afdfc3..baa69aae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ -## 0.17.6-dev0 +## 0.17.6-dev1 ### Enhancements ### Features ### Fixes +- **Do not use NLP to determine element types for extracted elements with hi_res.** This avoids extraneous Title elements in hi_res outputs. This only applies to *extracted* elements, meaning text objects that are found outside of Object Detection objects which get mapped to *inferred* elements. (*extracted* and *inferred* elements get merged together to form the list of `Element`s returned by `pdf_partition()`) ## 0.17.5 diff --git a/test_unstructured/partition/pdf_image/test_pdf.py b/test_unstructured/partition/pdf_image/test_pdf.py index 6d1145eb8..7a0c8ff29 100644 --- a/test_unstructured/partition/pdf_image/test_pdf.py +++ b/test_unstructured/partition/pdf_image/test_pdf.py @@ -823,8 +823,8 @@ def test_partition_categorization_backup(): example_doc_path("pdf/layout-parser-paper-fast.pdf"), strategy=PartitionStrategy.HI_RES, ) - # Should have changed the element class from Text to Title - assert isinstance(elements[0], Title) + # Should NOT have changed the element class from Text to Title + assert isinstance(elements[0], Text) assert elements[0].text == text diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index d1d66876e..94b12d557 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -141,7 +141,7 @@ def test_partition_msg_can_process_attachments(): "Text", "Text", "Image", - "Title", + "Text", "Text", "Title", "Title", diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html index a55cccdbb..210109c06 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/65/11/main.PMC6312790.pdf.html @@ -14,9 +14,9 @@

Contents lists available at ScienceDirect

-

+

Data in Brief -

+

journal homepage: www.elsevier.com/locate/dib

@@ -28,19 +28,19 @@ Data on environmental sustainable corrosion inhibitor for stainless steel in aggressive environment -

+

(Jee -

+

Omotayo Sanni n, Abimbola Patricia I. Popoola

Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa

-

+

a r t i c l e i n f o

-

+

a b s t r a c t

@@ -88,19 +88,19 @@

Value of the data

-

+

© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel

  • Type 316 in 0.5 M H2SO4 medium. The given data describe the inhibitive performance of eco-friendly egg shell powder on austenitic stainless steel Type 316 corrosion in sulphuric acid environment.
  • -

    +

    © The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316

  • can be used as basis in determining the inhibitive performance of the same inhibitor in other environments.
  • -

    +

    © The data can be used to examine the relationship between the process variable as it affect the

  • @@ -152,9 +152,9 @@ Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 —0.9393 0.0003 24.0910 2.8163 2 1.9460 0.0596 —0.8276 0.0002 121.440 1.5054 4 0.0163 0.2369 —0.8825 0.0001 42.121 0.9476 6 0.3233 0.0540 —0.8027 5.39E-05 373.180 0.4318 8 0.1240 0.0556 —0.5896 5.46E-05 305.650 0.3772 10 0.0382 0.0086 —0.5356 1.24E-05 246.080 0.0919
    -

    +

    rate (mm/year) -

    +

    The plot of inhibitor concentration over degree of surface coverage versus inhibitor concentration gives a straight line as shown in Fig. 5. The strong correlation reveals that egg shell adsorption on stainless surface in 0.5 M H2SO4 follow Langmuir adsorption isotherm. Figs. 6–8 show the SEM/EDX surface morphology analysis of stainless steel. Figs. 7 and 8 are the SEM/EDX images of the stainless steel specimens without and with inhibitor after weight loss experiment in sulphuric acid medium. The stainless steel surface corrosion product layer in the absence of inhibitor was porous and as a result gives no corrosion protection. With the presence of ES, corrosion damage was minimized, with an evidence of ES present on the metal surface as shown in Fig. 8.

    @@ -232,12 +232,12 @@

    The potentiodynamic polarization method was performed on the prepared test samples immersed in 0.5 M H2SO4 solution in the presence and absence of different ES concentrations. A three electrode system was used; stainless steel Type 316 plate as working electrode with an exposed area of 1.0 cm2, platinum rod as counter electrode and silver chloride electrode as reference electrode. The electrode was polished, degreased in acetone and thoroughly rinsed with distilled water before the experiment. Current density against applied potential was plotted. The slope of the linear part in anodic and cathodic plots gives anodic and cathodic constants according to the Stern–Geary equation, and the

    -

    +

    ð2Þ -

    -

    +

    +

    ð3Þ -

    +

    O. Sanni, A.P.I. Popoola / Data in Brief 22 (2019) 451–457
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html index bb95afd2b..aabc7233c 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-api/75/29/main.PMC6312793.pdf.html @@ -14,9 +14,9 @@

    Contents lists available at ScienceDirect

    -

    +

    Data in Brief -

    +

    journal homepage: www.elsevier.com/locate/dib

    @@ -28,9 +28,9 @@ A benchmark dataset for the multiple depot vehicle scheduling problem -

    +

    (eee -

    +

    Sarang Kulkarni a,b,c,n, Mohan Krishnamoorthy d,e, Abhiram Ranade f, Andreas T. Ernst c, Rahul Patil b

    @@ -52,16 +52,16 @@

    e School of Information Technology and Electrical Engineering, The University of Queensland, QLD 4072,

    -

    +

    Australia -

    +

    f Department of Computer Science and Engineering, IIT Bombay, Powai, Mumbai 400076, India

    -

    +

    a r t i c l e i n f o

    -

    +

    a b s t r a c t

    @@ -106,13 +106,13 @@

  • © The data provide all the information that is required to model the MDVSP by using the existing mathematical formulations.
  • -

    +

    e All the problem instances are available for use without any restrictions.

  • e The benchmark solutions and solution time for the problem instances are presented in [3] and can be used for the comparison.
  • -

    +

    © The dataset includes a program that can generate similar problem instances of different sizes.

    @@ -121,9 +121,9 @@

    The dataset contains 60 different problem instances of the multiple depot vehicle scheduling pro- blem (MDVSP). Each problem instance is provided in a separate file. Each file is named as ‘RN-m-n-k.dat’, where ‘m’, ‘n’, and ‘k’ denote the number of depots, the number of trips, and the instance number for the size, ‘ðm;nÞ’, respectively. For example, the problem instance, ‘RN-8–1500-01.dat’, is the first problem instance with 8 depots and 1500 trips. For the number of depots, m, we used three values, 8,12, and 16. The four values for the number of trips, n, are 1500, 2000, 2500, and 3000. For each size, ðm;nÞ, five instances are provided. The dataset can be downloaded from https://orlib.uqcloud.net. For each problem instance, the following information is provided:

    -

    +

    The number of depots mð -

    +

    Þ,

    @@ -187,9 +187,9 @@ Instance size (m, n) Average number of Locations Times Vehicles (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 (16, 3000) 1087.20 1101.60 1284.60 2,684,983.60
    -

    +

    Possible empty travels -

    +

    S. Kulkarni et al. / Data in Brief 22 (2019) 484–487
    diff --git a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html index 0862a71a2..eabce53c2 100644 --- a/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.html @@ -76,8 +76,8 @@

    Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,

    -

    +

    AQ3 -

    +

    diff --git a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html index c17be23f5..517f7a360 100644 --- a/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/google-drive/recalibrating-risk-report.pdf.html @@ -11,7 +11,7 @@ WORLD ASSOCIATION -

    +

    Recalibrating risk

    @@ -89,69 +89,69 @@ In terms of accidents, hydropower is the deadliest electricity generator, mostly due to collapsing dams and the consequences of flooding. The Banqiao Dam failure in 1975 led to at least 26,000 people drowning, and as many as 150,000 deaths resulting from the secondary effects of the accident. In comparison, radiation exposure following Chernobyl caused 54 deaths2, while no casualties due to radiation are likely to occur from the accident at Fukushima Daiichi.

    25  24.6  20  18.4  e  15  10  5  4.6  2.8  0  Coal  Oil  Bio m ass  Natural gas  0.07  Wind  0.04  Hydropower  0.02  Solar  0.01  Nuclear -

    +

    r -

    -

    +

    +

    a -

    -

    +

    +

    e -

    -

    +

    +

    y -

    -

    +

    +

    W -

    -

    +

    +

    T -

    -

    +

    +

    r -

    -

    +

    +

    e -

    -

    +

    +

    p -

    -

    +

    +

    s -

    +

    8

    -

    +

    e -

    -

    +

    +

    i -

    -

    +

    +

    t -

    -

    +

    +

    i -

    -

    +

    +

    l -

    -

    +

    +

    S -

    -

    +

    +

    a -

    -

    +

    +

    t -

    -

    +

    +

    a -

    -

    +

    +

    F -

    +

    Figure 3. Comparison of number of fatalities due to electricity generation, including accidents and air pollution3

    @@ -251,9 +251,9 @@
  • World Health Organization (2020). Road traffic injuries. Available at: https://www.who.int/news-room/fact-sheets/ detail/road-traffic-injuries
  • -

    +

    i -

    +

  • ii BBC (2020). Plane crash fatalities fell more than 50% in 2019. Available at: https://www.bbc.co.uk/news/ business-50953712
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html index ccc0784c7..dbf342486 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.html @@ -114,9 +114,9 @@
  • import layoutparser as lp
  • -

    +

    wwe -

    +

  • image = cv2.imread("image_file") # load images
  • diff --git a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html index 84e267218..eca4025c8 100644 --- a/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html +++ b/test_unstructured_ingest/expected-structured-output-html/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.html @@ -22,24 +22,24 @@
    2 n u J 1 2 ] V C . s c [ 2 v 8 4 3 5 1 . 3 0 1 2 :
    -

    +

    v -

    -

    +

    +

    arXiv -

    -

    +

    +

    i -

    -

    +

    +

    X -

    -

    +

    +

    r -

    -

    +

    +

    a -

    +

    LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis

    @@ -115,28 +115,28 @@ -

    +

    7 https://ocr-d.de/en/about -

    -

    +

    +

    8 https://github.com/BobLd/DocumentLayoutAnalysis -

    -

    +

    +

    9 https://github.com/leonlulu/DeepLayout -

    -

    +

    +

    10 https://github.com/hpanwar08/detectron2 -

    -

    +

    +

    11 https://github.com/JaidedAI/EasyOCR -

    -

    +

    +

    12 https://github.com/PaddlePaddle/PaddleOCR -

    +

    4

    -

    +

    Z. Shen et al.

    Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY @@ -263,7 +263,7 @@

    6

    -

    +

    Z. Shen et al.

    - ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff @@ -303,7 +303,7 @@

    LayoutParser also comes with a DL-based CNN-RNN OCR model [6] trained with the Connectionist Temporal Classification (CTC) loss [10]. It can be used like the other OCR modules, and can be easily trained on customized datasets.

    -

    +

    13 This is also available in the LayoutParser documentation pages.

  • diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json index 6f6c30b2a..c26c40673 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "97e80c6e7dc2754c9083b263ff65039e", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(Jee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "bddd1cbc864e9b44cc0715a1cccf8dbc", @@ -187,7 +187,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "b9e48f235de5b531427187eb6ea135fe", @@ -200,7 +200,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "911bfead9b546998812e2d1d615ecc87", @@ -432,7 +432,7 @@ "page_number": 2 }, "text": "© Data presented here provide optimum conditions of waste material as inhibitor for stainless steel", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "afed004de4c50d761640b6c18729a988", @@ -458,7 +458,7 @@ "page_number": 2 }, "text": "© The data obtained for the inhibition of waste product (egg shell powder) on stainless steel Type 316", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "cb6e8acb9c24820b59f8973cc236ef35", @@ -484,7 +484,7 @@ "page_number": 2 }, "text": "© The data can be used to examine the relationship between the process variable as it affect the", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "e1f7e635d8739a97d8d0000ba8004f61", @@ -744,7 +744,7 @@ "page_number": 4 }, "text": "rate (mm/year)", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "3a5534c2aafc2d8a4c0b65d530d00ab3", @@ -1134,7 +1134,7 @@ "page_number": 6 }, "text": "ð2Þ", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "cff55ae1916232dbda5239f59c897cb9", @@ -1147,7 +1147,7 @@ "page_number": 6 }, "text": "ð3Þ", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "e40c3ee561b10ca5b7a76900c8d5b263", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json index 1fab6122c..17e092312 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json @@ -63,7 +63,7 @@ "page_number": 1 }, "text": "Data in Brief", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "c1b3d4f53698b892fcc23fc10a72e6fb", @@ -148,7 +148,7 @@ "page_number": 1 }, "text": "(eee", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "0cda4eb20070fdf01ec0d47b2a550241", @@ -252,7 +252,7 @@ "page_number": 1 }, "text": "Australia", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "85875ebbc1de554e92edc54674add1d5", @@ -278,7 +278,7 @@ "page_number": 1 }, "text": "a r t i c l e i n f o", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "4f3f69dd17ddae776c656ec73d9837ae", @@ -291,7 +291,7 @@ "page_number": 1 }, "text": "a b s t r a c t", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "34522460857b10c63d8c2c8d2fbb3087", @@ -534,7 +534,7 @@ "page_number": 2 }, "text": "e All the problem instances are available for use without any restrictions.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "d401597b8ff2854bfb89f2833d02a763", @@ -560,7 +560,7 @@ "page_number": 2 }, "text": "© The dataset includes a program that can generate similar problem instances of different sizes.", - "type": "NarrativeText" + "type": "UncategorizedText" }, { "element_id": "fb765d6762e6a423cb8b9dab27359732", @@ -606,7 +606,7 @@ "page_number": 2 }, "text": "The number of depots mð", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "320f6d28582c354d35673c2a4119851f", @@ -892,7 +892,7 @@ "page_number": 3 }, "text": "Possible empty travels", - "type": "Title" + "type": "UncategorizedText" }, { "element_id": "fa23407a7c3c99ae3b6fb79034698807", diff --git a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json index 3641fcd43..67cd5fb08 100644 --- a/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/biomed-path/07/07/sbaa031.073.PMC7234218.pdf.json @@ -309,6 +309,6 @@ "page_number": 1 }, "text": "AQ3", - "type": "Title" + "type": "UncategorizedText" } ] \ No newline at end of file diff --git a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json index 49e17cb5f..6e7d6aa5f 100644 --- a/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json @@ -186,7 +186,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "7137c1e14141fad3ad306fe68918a967", "text": "Recalibrating risk", "metadata": { @@ -2790,7 +2790,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a8706e82b3f90cffc996a24348e3b670", "text": "r", "metadata": { @@ -2883,7 +2883,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "da631c23500655c51b9311a61f55744f", "text": "a", "metadata": { @@ -2976,7 +2976,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d78a11e9e55235934c3a4922053c68e5", "text": "e", "metadata": { @@ -3069,7 +3069,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8d14df8b7fd7744365fbf8e02d69415a", "text": "y", "metadata": { @@ -3162,7 +3162,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "f4df01bee1b8ffb973ac8539649c5189", "text": "W", "metadata": { @@ -3255,7 +3255,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "b733cf49de269e22bed7c9883b958669", "text": "T", "metadata": { @@ -3348,7 +3348,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c4b47d788b26c3d5c62ad462ed3ca2db", "text": "r", "metadata": { @@ -3441,7 +3441,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "bff4435574259239761670b31432cc8a", "text": "e", "metadata": { @@ -3534,7 +3534,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8ba15a3a71eb0bb689c582098cce6730", "text": "p", "metadata": { @@ -3627,7 +3627,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5fde097ba00ad7647206ae11c721d28c", "text": "s", "metadata": { @@ -3813,7 +3813,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "81f1f3b9da6df38d938bf7871fa069b5", "text": "e", "metadata": { @@ -3906,7 +3906,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "aa4a79651a9a0087b66fcc40a2213113", "text": "i", "metadata": { @@ -3999,7 +3999,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6d1c0d05d3a424b43d9572188a76c2d4", "text": "t", "metadata": { @@ -4092,7 +4092,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "392a17b2f3eba46f4bcf078e0b204514", "text": "i", "metadata": { @@ -4185,7 +4185,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d24a9a771e46fdd6b269f1ecaf0b5eec", "text": "l", "metadata": { @@ -4278,7 +4278,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9dc4537afa8ae0b959a542f9ba5c1e03", "text": "S", "metadata": { @@ -4371,7 +4371,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "919dac2487a4c860747318a132a54a72", "text": "a", "metadata": { @@ -4464,7 +4464,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "04ee5d05c3fcfffd945762e803478600", "text": "t", "metadata": { @@ -4557,7 +4557,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "63dabde368e2cf310d20a885fe50314a", "text": "a", "metadata": { @@ -4650,7 +4650,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "796538927664e4d87312c428469428f5", "text": "F", "metadata": { @@ -8184,7 +8184,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "a95a2add68d668b944cc332c88ea721e", "text": "i", "metadata": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json index 147e62d12..c71cf5096 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper-with-table.jpg.json @@ -177,7 +177,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9d40bf1b2e2af1692f5689a1c44ab2ae", "text": "wwe", "metadata": { diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json index b9d9f35d1..3f42ca335 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json @@ -110,7 +110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4608f9aa33a0cab158565817b0d15743", "text": "v", "metadata": { @@ -132,7 +132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "6f69e5f921907e689f1a52bd84282b31", "text": "arXiv", "metadata": { @@ -154,7 +154,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "ed4e590932b333f40d0e1367b6b0e32e", "text": "i", "metadata": { @@ -176,7 +176,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "8cb024fb60457b7c572b167801037f75", "text": "X", "metadata": { @@ -198,7 +198,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "c202bdacd2daf4c52fa3a6ddd64a0728", "text": "r", "metadata": { @@ -220,7 +220,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "3db474893ec321c81ef9d1a2afd5f660", "text": "a", "metadata": { @@ -1022,7 +1022,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "db639db124b6064248de0c0dc71510a4", "text": "7 https://ocr-d.de/en/about", "metadata": { @@ -1044,7 +1044,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "d881ce84f017d89f6e35e2bc4b133bfc", "text": "8 https://github.com/BobLd/DocumentLayoutAnalysis", "metadata": { @@ -1066,7 +1066,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "9b96c128deddda1a32c739a2df157496", "text": "9 https://github.com/leonlulu/DeepLayout", "metadata": { @@ -1088,7 +1088,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "5cf72e821375f4480a1529bef97608ef", "text": "10 https://github.com/hpanwar08/detectron2", "metadata": { @@ -1110,7 +1110,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "4ab94e79eedc3a7ac498aaf737ca8878", "text": "11 https://github.com/JaidedAI/EasyOCR", "metadata": { @@ -1132,7 +1132,7 @@ } }, { - "type": "Title", + "type": "UncategorizedText", "element_id": "460b163c13ad7cad4fce325820a76481", "text": "12 https://github.com/PaddlePaddle/PaddleOCR", "metadata": { @@ -1176,7 +1176,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "92c4289ad4af7c0793e40d5662707e0a", "text": "Z. Shen et al.", "metadata": { @@ -1739,7 +1739,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "710ac103981c6363195774b02ee582d4", "text": "Z. Shen et al.", "metadata": { @@ -2083,7 +2083,7 @@ } }, { - "type": "NarrativeText", + "type": "UncategorizedText", "element_id": "a2a0a2ef0279f0710f3cd34474ca8645", "text": "13 This is also available in the LayoutParser documentation pages.", "metadata": { diff --git a/unstructured/__version__.py b/unstructured/__version__.py index db302d22c..1c6678160 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.17.6-dev0" # pragma: no cover +__version__ = "0.17.6-dev1" # pragma: no cover diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py index e0e64854d..d38658ed6 100644 --- a/unstructured/partition/pdf.py +++ b/unstructured/partition/pdf.py @@ -362,7 +362,10 @@ def partition_pdf_or_image( table_ocr_agent=table_ocr_agent, **kwargs, ) - out_elements = _process_uncategorized_text_elements(elements) + # NOTE(crag): do not call _process_uncategorized_text_elements here, because + # extracted elements (which are text blocks outside of OD-determined blocks) + # are likely not Titles and should not be identified as such. + return elements elif strategy == PartitionStrategy.FAST: out_elements = _partition_pdf_with_pdfparser(