feat: improve reading order (#2219)

Closes GH Issue #2208.
This commit is contained in:
Christine Straub 2023-12-07 23:21:10 -08:00 committed by GitHub
parent 46cb3060ac
commit 4ad01efe23
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 160 additions and 52 deletions

1
.gitignore vendored
View File

@ -12,6 +12,7 @@ build/
develop-eggs/
dist/
downloads/
figures/
eggs/
.eggs/
lib/

View File

@ -1,4 +1,4 @@
## 0.11.4-dev5
## 0.11.4-dev6
### Enhancements

View File

@ -154,7 +154,7 @@
"type": "ListItem"
},
{
"element_id": "ff686e6046cd8176988d8dec0d8adac4",
"element_id": "c2036e04407827dd9f895cce7fdb8674",
"metadata": {
"data_source": {
"date_created": "2023-03-10T09:32:44+00:00",

View File

@ -325,7 +325,7 @@
"type": "NarrativeText"
},
{
"element_id": "82391aed75376c2c3bc734ad52ec73e4",
"element_id": "63ffdff8ce24056d9d776dda2adcc934",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -334,7 +334,7 @@
],
"page_number": 2
},
"text": "How data were acquired The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225230. Data format Experimental factors Experimental features Data source location Accessibility Related research article",
"text": "How data were acquired Data format Experimental factors Experimental features Data source location Accessibility Related research article The cleaned and weighed specimen was suspended in beakers con- taining 0.5 M H2SO4 solution of different concentrations of egg shell powder. The pre-weighed stainless steel samples were retrieved from the test solutions after every 24 h, cleaned appropriately, dried and reweighed. Raw, analyzed The difference between the weight at a given time and the initial weight of the specimen was taken as the weight loss, which was used to calculate the corrosion rate and inhibition efficiency. Inhibitor concentration, exposure time Department of Chemical, Metallurgical and Materials Engineering, Tshwane University of Technology, Pretoria, South Africa Data are available within this article O. Sanni, A. P. I. Popoola, and O. S. I. Fayomi, Enhanced corrosion resistance of stainless steel type 316 in sulphuric acid solution using eco-friendly waste product, Results in Physics, 9 (2018) 225230.",
"type": "Table"
},
{
@ -429,7 +429,7 @@
"type": "NarrativeText"
},
{
"element_id": "ba559032c2f9f98c24e4c547af135b8e",
"element_id": "3fcf35e8b67240541d3f2bf3bc0a39c5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -438,7 +438,7 @@
],
"page_number": 2
},
"text": "30 10g 8g 6g 4g 2g Control ) g m ( s s o 20 l t h g e W i 10 48 96 144 192 ",
"text": "30 ) g m ( s s o l t h g e W i 20 10g 8g 6g 4g 2g Control 10 48 96 144 192 ",
"type": "Image"
},
{
@ -481,7 +481,7 @@
"type": "NarrativeText"
},
{
"element_id": "f2e384e79a4fbce052f262a93ec46102",
"element_id": "9fd8126152fe50909dc643c92ff6cd4c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -490,7 +490,7 @@
],
"page_number": 3
},
"text": "2.7 1.8 10g 8g 6g 4g 2g Control 0.9 24 48 72 96 120 144 168 192 Exposure time ",
"text": "2.7 1.8 0.9 10g 8g 6g 4g 2g Control 24 48 72 96 120 144 168 192 Exposure time ",
"type": "Image"
},
{
@ -585,7 +585,7 @@
"type": "UncategorizedText"
},
{
"element_id": "00c6c21aa97f59dc84190f023eaaf769",
"element_id": "c1daf0ef9e2135894ab832147233a7f3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -594,7 +594,7 @@
],
"page_number": 3
},
"text": "90 2g 4g 6g 8g 10g 80 ) % 70 ( y c n e c i f f 60 i 50 E n o i t i b h n I 40 i 30 20 10 0 20 40 60 80 100 120 140 160 180 ",
"text": "90 ) % ( y c n e c i f f i E n o i t i b h n I i 80 70 60 50 40 30 2g 4g 6g 8g 10g 20 10 0 20 40 60 80 100 120 140 160 180 ",
"type": "Image"
},
{
@ -715,7 +715,7 @@
"type": "UncategorizedText"
},
{
"element_id": "150b064badb909ac7549f8064cf2caba",
"element_id": "a31f676a690660014de4c38544212163",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -724,7 +724,7 @@
],
"page_number": 4
},
"text": "icorr (A/cm2) Polarization resistance (Ω) Inhibitor concentration (g) bc (V/dec) ba (V/dec) Ecorr (V) (cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356 0 2 4 6 8 10 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 24.0910 121.440 42.121 373.180 305.650 246.080 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919",
"text": "Inhibitor concentration (g) bc (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm2) Polarization resistance (Ω) 0 2 4 6 8 10 0.0335 1.9460 0.0163 0.3233 0.1240 0.0382 0.0409 0.0596 0.2369 0.0540 0.0556 0.0086 (cid:3) 0.9393 (cid:3) 0.8276 (cid:3) 0.8825 (cid:3) 0.8027 (cid:3) 0.5896 (cid:3) 0.5356 0.0003 0.0002 0.0001 5.39E-05 5.46E-05 1.24E-05 24.0910 121.440 42.121 373.180 305.650 246.080 2.8163 1.5054 0.9476 0.4318 0.3772 0.0919",
"type": "Table"
},
{
@ -1092,7 +1092,7 @@
"type": "Formula"
},
{
"element_id": "aa63e6aba52eb53a896d01e2c7ccc133",
"element_id": "13bc6e646e1dea06c3836e074c7fe40f",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -1101,7 +1101,7 @@
],
"page_number": 6
},
"text": "IE ð%Þ ¼ CRo (cid:3) CR 100 1 x CRo",
"text": "IE ð%Þ ¼ CRo (cid:3) CR CRo x 100 1",
"type": "Formula"
},
{

View File

@ -416,7 +416,7 @@
"type": "UncategorizedText"
},
{
"element_id": "4e5faed345ed29d23513a466e412ec0a",
"element_id": "d022b06e927bb8ee92ff9034e08e62de",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -425,7 +425,7 @@
],
"page_number": 2
},
"text": "i , a start location, ls i , and an end location, le i , i , an end time, te and",
"text": "i , a start location, ls i , an end time, te i , and an end location, le i , and",
"type": "NarrativeText"
},
{
@ -637,7 +637,7 @@
"type": "Title"
},
{
"element_id": "ff667ddf988229560eaac54fc38ddc66",
"element_id": "cf21fea12c5e4fac7b8606af479c6edf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
@ -646,7 +646,7 @@
],
"page_number": 4
},
"text": "Number of lines Number of columns in each line Description 1 1 n 3 m 4 The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. i , the start i , the end location le l l",
"text": "Number of lines Number of columns in each line Description 1 1 n l 3 m 4 l The number of depots, the number of trips, and the number of locations. The number of vehicles rd at each depot d. One line for each trip, i ¼ 1; 2; …; n. Each line provides the start location ls time ts i and the end time te i for the corresponding trip. Each element, δij; where i; j A 1; 2; …; l, refers to the travel time between location i and location j. i , the start i , the end location le",
"type": "Table"
},
{

View File

@ -760,7 +760,7 @@
"type": "FigureCaption"
},
{
"element_id": "c57f2166778009c6ccc9032ee8883253",
"element_id": "4577aebb24ff9c0fd98387936d5ef4a7",
"metadata": {
"data_source": {
"permissions_data": [
@ -777,7 +777,7 @@
"page_number": 5,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet B8]|</td><td>F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA</td><td>M</td><td>Layouts of scanned modern magazines and scientific report</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Base Model1 Large Model Notes Dataset PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents",
"text": "Dataset Base Model1 Large Model Notes PubLayNet [38] PRImA [3] Newspaper [17] TableBank [18] HJDataset [31] F / M M F F F / M M - - F - Layouts of modern scientific documents Layouts of scanned modern magazines and scientific reports Layouts of scanned US newspapers from the 20th century Table region on modern scientific and business document Layouts of history Japanese documents",
"type": "Table"
},
{
@ -1261,7 +1261,7 @@
"type": "NarrativeText"
},
{
"element_id": "b35cc086edca679ccc52fa6701857549",
"element_id": "35adc4ddaef6ffb21b754c9c350c856f",
"metadata": {
"data_source": {
"permissions_data": [
@ -1278,7 +1278,7 @@
"page_number": 8,
"text_as_html": "<table><thead><th>block.pad(top, bottom,</th><th>right,</th><th>left)</th><th>Enlarge the current block according to the input</th></thead><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>block1.is_in(block2)</td><td></td><td></td><td>Whether block] is inside of block2</td></tr><tr><td>block1. intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs</td></tr><tr><td>block1.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block to relative coordinates to block2</td></tr><tr><td>block1.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2s absolute coordinates</td></tr><tr><td>block. crop_image (image)</td><td></td><td></td><td>Obtain the image segments in the block region</td></tr></table>"
},
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input Scale the current block given the ratio in x and y direction block.scale(fx, fy) Move the current block with the shift distances in x and y direction block.shift(dx, dy) Whether block1 is inside of block2 block1.is in(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.intersect(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.relative to(block2) Calculate the absolute coordinates of block1 given the canvas block2s absolute coordinates block1.condition on(block2) Obtain the image segments in the block region block.crop image(image)",
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio in x and y direction block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) Return the intersection region of block1 and block2. Coordinate type to be determined based on the inputs. block1.union(block2) Return the union region of block1 and block2. Coordinate type to be determined based on the inputs. block1.relative to(block2) Convert the absolute coordinates of block1 to relative coordinates to block2 block1.condition on(block2) Calculate the absolute coordinates of block1 given the canvas block2s absolute coordinates block.crop image(image) Obtain the image segments in the block region",
"type": "Table"
},
{

View File

@ -1197,7 +1197,7 @@
"type": "Title"
},
{
"element_id": "429d7ccdab398bfb2107fa00f9054272",
"element_id": "8174b87b76dfe8e8ddb31ab83abc6c33",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@ -1214,7 +1214,7 @@
],
"page_number": 7
},
"text": "WEO Projections 1/ Estimate 2022 Projections 2023 Estimate 2022 Projections 2023 2021 2024 2023 2024 2024 6.2 3.4 2.9 3.1 0.2 0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 0.1 0.2 0.9 0.0 0.3 0.2 0.2 0.2 0.1 0.0 0.4 0.2 0.4 0.3 0.1 0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 0.4 1.1 0.1 0.2 0.1 0.1 0.0 0.0 0.0 0.1 0.6 0.3 0.4 0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 2.0 4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 0.2 0.4 0.4 0.0 0.1 0.3 0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 0.1 0.0 0.3 0.3 0.4 0.0 . . . . . . . . . . . . . . . . . . . . . . . . . . . 
65.8 26.4 39.8 7.0 16.2 6.3 7.1 0.4 3.3 0.1 0.9 0.3 11.2 2.0 9.8 1.4 5.9 0.2",
"text": "2021 Estimate 2022 Projections 2023 2024 WEO Projections 1/ 2023 2024 Estimate 2022 Projections 2023 2024 6.2 3.4 2.9 3.1 0.2 0.1 1.9 3.2 3.0 Advanced Economies United States Euro Area Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 0.6 1.5 2.0 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 0.1 0.4 0.2 0.4 0.0 0.8 0.1 0.2 0.9 0.0 0.3 0.2 0.2 0.2 0.1 0.0 0.4 0.2 0.4 0.3 0.1 0.2 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 0.5 1.2 2.1 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 Emerging Market and Developing Economies Emerging and Developing Asia China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 3.9 4.3 3.0 6.8 0.7 2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 0.4 1.1 0.1 0.2 0.1 0.1 0.0 0.0 0.0 0.1 0.6 0.3 0.4 0.2 0.2 0.5 0.0 0.0 0.0 2.5 3.4 2.9 4.3 2.0 4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries 6.0 5.5 3.8 4.1 7.0 4.1 3.1 3.7 5.2 5.4 3.8 4.9 2.4 0.7 4.3 3.2 4.0 4.9 2.5 1.8 4.7 3.5 4.1 5.6 0.3 0.0 0.2 0.4 0.4 0.0 0.1 0.3 0.2 0.2 0.0 0.1 1.7 1.8 3.7 . . . 2.5 . . . 2.5 1.2 5.7 . . . 5.0 . . . 2.5 2.0 4.0 . . . 4.1 . . . 10.4 9.4 12.1 5.4 6.6 3.4 2.4 2.3 2.6 3.4 2.7 4.6 0.1 0.0 0.3 0.3 0.4 0.0 . . . . . . . . . . . . . . . . . . . . . . . . . . . 
65.8 26.4 39.8 7.0 16.2 6.3 7.1 0.4 3.3 0.1 0.9 0.3 11.2 2.0 9.8 1.4 5.9 0.2",
"type": "Table"
},
{
@ -2310,7 +2310,7 @@
"type": "NarrativeText"
},
{
"element_id": "5ad3e97ac0a2d759e059893765b81954",
"element_id": "d750b11efc2f858c7deadb09e3929e1c",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@ -2327,7 +2327,7 @@
],
"page_number": 11
},
"text": "United States Euro area China Other AEs Other EMs 7 October 2022 GFSR 6 5 4 3 2 1 0 1 2 3 2006 08 08 10 10 12 12 14 16 14 18 18 20 22 22 06 16 20 ",
"text": "7 6 5 4 United States Euro area China Other AEs Other EMs October 2022 GFSR 3 2 1 0 1 2 3 2006 08 08 06 10 10 12 12 14 16 14 16 18 18 20 22 22 20 ",
"type": "Image"
},
{
@ -2394,7 +2394,7 @@
"type": "Title"
},
{
"element_id": "5728dbbab19d146278a6a3387e8e40d5",
"element_id": "6215d8f373972db90d05458d63af9efe",
"metadata": {
"data_source": {
"date_modified": "2023-02-14T07:31:28",
@ -2411,7 +2411,7 @@
],
"page_number": 11
},
"text": "Latest October 2022 GFSR 5 6 2. Euro area 1. United States 5 4 4 3 3 2 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 ",
"text": "Latest October 2022 GFSR 6 1. United States 2. Euro area 5 4 3 2 5 4 3 2 1 1 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 Oct. 22 Apr. 23 Oct. 23 Dec. 24 Dec. 26 ",
"type": "Image"
},
{

View File

@ -441,7 +441,7 @@
"type": "UncategorizedText"
},
{
"element_id": "27a3cde643219ef7662f032684e06bd4",
"element_id": "d5e389eb1b6b367ac5cf6e12acccfcbc",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@ -458,7 +458,7 @@
],
"page_number": 4
},
"text": " Marine CSP 40,000 Solar PV 35,000 Geothermal 30,000 Wind Bioenergy 25,000 Hydro 20,000 Nuclear 15,000 Gas 10,000 Oil Coal 5,000 0 ",
"text": " Marine 40,000 CSP 35,000 Solar PV Geothermal 30,000 Wind 25,000 Bioenergy 20,000 Hydro Nuclear 15,000 Gas 10,000 Oil 5,000 Coal 0 ",
"type": "Image"
},
{
@ -1071,7 +1071,7 @@
"type": "Title"
},
{
"element_id": "196f551acb55f0373a9d7fac6c9dbeab",
"element_id": "2a5e6485f55769e5d4c820cb79f018d7",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@ -1088,7 +1088,7 @@
],
"page_number": 7
},
"text": "300 250 200 150 100 50 0 O nshore Wind Offshore Wind N uclear m ercial Photovoltaic C oal C C G T C o m ",
"text": "300 250 200 150 100 50 0 m ercial Photovoltaic C o m O nshore Wind Offshore Wind N uclear C C G T C oal ",
"type": "Image"
},
{
@ -1281,7 +1281,7 @@
"type": "UncategorizedText"
},
{
"element_id": "577fd212dac38df299e478d6b7ce5d74",
"element_id": "2d52b4cb071eb1384e8f64581d907335",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@ -1298,7 +1298,7 @@
],
"page_number": 8
},
"text": "120 120 99.5 100 71.9 80 60 40 20 8.5 1.78 0.245 <0.01 0 Offshore wind O nshore wind (G erm any) C oal Oil N atural gas N uclear* S olar P V (U K) ",
"text": "120 100 120 99.5 80 60 71.9 40 20 0 C oal Oil N atural gas 8.5 1.78 Offshore wind O nshore wind (G erm any) (U K) 0.245 S olar P V <0.01 N uclear* ",
"type": "Image"
},
{
@ -1533,7 +1533,7 @@
"type": "UncategorizedText"
},
{
"element_id": "1459b67becac6e70efecfcbc9312d3f0",
"element_id": "67ff7489d537e35454934b9dc3a725f9",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@ -1550,7 +1550,7 @@
],
"page_number": 8
},
"text": " Coal 90 Gas/Oil 80 Biofuels/Waste Wind/Solar 70 Hydro 60 Nuclear 50 40 30 20 10 ",
"text": "90 Coal Gas/Oil 80 Biofuels/Waste 70 Wind/Solar 60 Hydro Nuclear 50 40 30 20 10 ",
"type": "Image"
},
{
@ -1659,7 +1659,7 @@
"type": "FigureCaption"
},
{
"element_id": "ff963f0df99d82f7c343649121217117",
"element_id": "dddeec4eec1ff6db9e832ed00fea1b7e",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:10:36",
@ -1676,7 +1676,7 @@
],
"page_number": 9
},
"text": "600 Non-hydro 500 ren. & waste Nuclear 400 Natural gas 300 Hydro Oil 200 Coal 100 0 ",
"text": "600 500 Non-hydro ren. & waste 400 Nuclear Natural gas 300 Hydro 200 Oil Coal 100 0 ",
"type": "Image"
},
{

View File

@ -336,7 +336,7 @@
"type": "NarrativeText"
},
{
"element_id": "9405da801e46d0da5f19ea801ff4ff51",
"element_id": "92a15f52537ead259f4d9c2da1b22454",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@ -353,7 +353,7 @@
],
"page_number": 4
},
"text": "Experts 1 Nuclear power 20 2 Motor vehicles 1 3 Handguns 4 4 Smoking 2 17 Electric power (non-nuclear) 9 22 X-rays 7 30 Vaccinations 25",
"text": "Experts 1 20 Nuclear power Motor vehicles 2 1 4 3 Handguns 2 4 Smoking Electric power (non-nuclear) 9 17 22 7 X-rays 25 30 Vaccinations",
"type": "Table"
},
{
@ -630,7 +630,7 @@
"type": "UncategorizedText"
},
{
"element_id": "0bcb3759fa68b68d784c3c3963253c90",
"element_id": "44f0d817d4311d9da996b2cb20dc80c8",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@ -647,7 +647,7 @@
],
"page_number": 5
},
"text": "25 20 18.4 15 10 4.6 5 2.8 0.07 0.04 0.02 0.01 0 C oal Oil Bio m ass N atural gas Wind H ydropo w er S olar N uclear ",
"text": "25 20 18.4 15 10 5 4.6 2.8 0 C oal Oil Bio m ass N atural gas 0.07 Wind 0.04 H ydropo w er 0.02 S olar 0.01 N uclear ",
"type": "Image"
},
{
@ -987,7 +987,7 @@
"type": "NarrativeText"
},
{
"element_id": "73ffa3745f99b6332d0ddfac674755c6",
"element_id": "79de44b69099529ba9f79b31427cad59",
"metadata": {
"data_source": {
"date_modified": "2023-02-12T10:09:32",
@ -1004,7 +1004,7 @@
],
"page_number": 7
},
"text": "Social and environmental costs of emissions, land-use, climate change, security of supply, etc. Plant-level production costs at market prices Grid-level costs of the electricity system ",
"text": "Plant-level production costs at market prices Grid-level costs of the electricity system Social and environmental costs of emissions, land-use, climate change, security of supply, etc. ",
"type": "Image"
},
{

View File

@ -1,3 +1,3 @@
metric average sample_sd population_sd count
cct-accuracy 0.803 0.249 0.241 16
cct-%missing 0.024 0.033 0.032 16
cct-accuracy 0.803 0.248 0.241 16
cct-%missing 0.025 0.033 0.032 16

1 metric average sample_sd population_sd count
2 cct-accuracy 0.803 0.249 0.248 0.241 16
3 cct-%missing 0.024 0.025 0.033 0.032 16

View File

@ -11,7 +11,7 @@ ideas-page.html html local 0.929 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.846 0.039
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.949 0.029
2023-Jan-economic-outlook.pdf pdf s3 0.845 0.039
page-with-formula.pdf pdf s3 0.971 0.021
recalibrating-risk-report.pdf pdf s3 0.973 0.007
recalibrating-risk-report.pdf pdf s3 0.968 0.008

1 filename doctype connector cct-accuracy cct-%missing
11 UDHR_first_article_all.txt txt local-single-file 0.995 0.0
12 fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
13 layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
14 layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.945 0.949 0.029
15 2023-Jan-economic-outlook.pdf pdf s3 0.846 0.845 0.039
16 page-with-formula.pdf pdf s3 0.971 0.021
17 recalibrating-risk-report.pdf pdf s3 0.973 0.968 0.007 0.008

View File

@ -1 +1 @@
__version__ = "0.11.4-dev5" # pragma: no cover
__version__ = "0.11.4-dev6" # pragma: no cover

View File

@ -18,6 +18,7 @@ from unstructured.partition.pdf_image.pdfminer_utils import (
rect_to_bbox,
)
from unstructured.partition.utils.constants import Source
from unstructured.partition.utils.sorting import sort_text_regions
if TYPE_CHECKING:
from unstructured_inference.inference.layout import DocumentLayout
@ -95,7 +96,7 @@ def get_regions_by_pdfminer(
layouts = []
# Coefficient to rescale bounding box to be compatible with images
coef = dpi / 72
for i, (page, page_layout) in enumerate(open_pdfminer_pages_generator(fp)):
for page, page_layout in open_pdfminer_pages_generator(fp):
height = page_layout.height
layout: List["TextRegion"] = []
@ -125,7 +126,13 @@ def get_regions_by_pdfminer(
if text_region.bbox is not None and text_region.bbox.area > 0:
layout.append(text_region)
# NOTE(christine): always do the basic sort first for deterministic order across
# python versions.
layout = order_layout(layout)
# apply the current default sorting to the layout elements extracted by pdfminer
layout = sort_text_regions(layout)
layouts.append(layout)
return layouts

View File

@ -1,5 +1,5 @@
import os
from typing import List, Tuple
from typing import TYPE_CHECKING, Any, List, Tuple
import numpy as np
@ -8,6 +8,9 @@ from unstructured.logger import trace_logger
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_XY_CUT
from unstructured.partition.utils.xycut import recursive_xy_cut, recursive_xy_cut_swapped
if TYPE_CHECKING:
from unstructured_inference.inference.elements import TextRegion
def coordinates_to_bbox(coordinates: CoordinatesMetadata) -> Tuple[int, int, int, int]:
"""
@ -73,6 +76,24 @@ def coord_has_valid_points(coordinates: CoordinatesMetadata) -> bool:
return True
def bbox_is_valid(bbox: Any) -> bool:
    """Return True when `bbox` holds exactly four non-negative values.

    The check is deliberately tolerant of malformed input: anything that is
    missing, has no length, has a length other than four, or contains a value
    that cannot be compared with ``0`` (for example ``None``) is reported as
    invalid instead of raising.

    Note that zero is accepted; only strictly negative values are rejected.
    """
    if bbox is None:
        return False
    try:
        # `len()` is used instead of truthiness so that array-like inputs
        # (whose truth value may be ambiguous) and non-sized inputs are both
        # handled through the same TypeError path.
        if len(bbox) != 4:
            return False
        return not any(v < 0 for v in bbox)
    except TypeError:
        # Raised for inputs without a length, non-iterables, or values that
        # do not support comparison with an int.
        return False
def sort_page_elements(
page_elements: List[Element],
sort_mode: str = SORT_MODE_XY_CUT,
@ -163,3 +184,82 @@ def sort_page_elements(
sorted_page_elements = page_elements
return sorted_page_elements
def sort_bboxes_by_xy_cut(
    bboxes,
    shrink_factor: float = 0.9,
    xy_cut_primary_direction: str = "x",
):
    """Order bounding boxes with the recursive XY-cut routine.

    Every box is first passed through `shrink_bbox` with `shrink_factor`, the
    results are converted to an integer array, and the selected XY-cut
    function fills a result list that is returned to the caller.

    `xy_cut_primary_direction` equal to ``"x"`` selects
    `recursive_xy_cut_swapped`; any other value selects `recursive_xy_cut`.

    The returned list is whatever the XY-cut function appended to it; the
    caller in this module uses its entries as positions into the input
    sequence.
    """
    if xy_cut_primary_direction == "x":
        cut = recursive_xy_cut_swapped
    else:
        cut = recursive_xy_cut

    shrunk = [shrink_bbox(box, shrink_factor) for box in bboxes]

    order: List[int] = []
    # The cut function reports its result by appending to `order` in place.
    cut(np.asarray(shrunk).astype(int), np.arange(len(shrunk)), order)
    return order
def sort_text_regions(
    elements: List["TextRegion"],
    sort_mode: str = SORT_MODE_XY_CUT,
    shrink_factor: float = 0.9,
    xy_cut_primary_direction: str = "x",
) -> List["TextRegion"]:
    """Sort a list of TextRegion elements based on the specified sorting mode.

    Only ``SORT_MODE_XY_CUT`` reorders anything; for every other mode the
    input list is returned unchanged. In XY-cut mode the input is also
    returned unchanged (after a trace log entry) when any region has no
    bounding box or has a bounding box rejected by `bbox_is_valid`.

    The environment variables ``UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR`` and
    ``UNSTRUCTURED_XY_CUT_PRIMARY_DIRECTION`` take precedence over the
    `shrink_factor` and `xy_cut_primary_direction` arguments when set.

    Args:
        elements: regions exposing a `bbox` with `x1`, `y1`, `x2`, `y2`.
        sort_mode: sorting strategy name.
        shrink_factor: factor forwarded to `sort_bboxes_by_xy_cut`.
        xy_cut_primary_direction: direction forwarded to
            `sort_bboxes_by_xy_cut`.

    Returns:
        A new list in sorted order, or the original `elements` object when no
        sorting was applied.
    """
    if not elements:
        return elements

    if sort_mode != SORT_MODE_XY_CUT:
        return elements

    bboxes = []
    for el in elements:
        region_bbox = el.bbox
        # Check for a missing bbox *before* reading its coordinates; reading
        # them first would raise instead of falling back to the input order.
        if region_bbox is None:
            trace_logger.detail(  # type: ignore
                "some or all elements are missing bboxes, skipping sort",
            )
            return elements
        bbox = (region_bbox.x1, region_bbox.y1, region_bbox.x2, region_bbox.y2)
        if not bbox_is_valid(bbox):
            trace_logger.detail(f"bbox {bbox} does not have valid values")  # type: ignore
            return elements
        bboxes.append(bbox)

    shrink_factor = float(
        os.environ.get("UNSTRUCTURED_XY_CUT_BBOX_SHRINK_FACTOR", shrink_factor),
    )
    xy_cut_primary_direction = os.environ.get(
        "UNSTRUCTURED_XY_CUT_PRIMARY_DIRECTION",
        xy_cut_primary_direction,
    )

    res = sort_bboxes_by_xy_cut(
        bboxes=bboxes,
        shrink_factor=shrink_factor,
        xy_cut_primary_direction=xy_cut_primary_direction,
    )
    return [elements[i] for i in res]