enhancement: implement full-page OCR (#1133)

* Implements full-page OCR as supported in unstructured-inference==0.5.10.
This commit is contained in:
Christine Straub 2023-08-16 12:16:35 -07:00 committed by GitHub
parent be093d2e66
commit 0a23139720
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 105 additions and 105 deletions

View File

@ -1,4 +1,10 @@
## 0.10.1-dev0
## 0.10.1-dev1
### Enhancements
* Bump unstructured-inference==0.5.10:
  - implement full-page OCR
### Features
### Fixes
* Fix dead links in repository README (Quick Start > Install for local development, and Learn more > Batch Processing)

View File

@ -25,3 +25,5 @@ Pillow<10.0.0
# NOTE(alan) Pinned to avoid error that occurs with 2.4.3:
# AttributeError: 'ResourcePath' object has no attribute 'collection'
Office365-REST-Python-Client<2.4.3
# NOTE(christine) Pinned here so every compiled requirements file resolves the same `unstructured-inference` version
unstructured-inference==0.5.10

View File

@ -6,4 +6,4 @@ pdfminer.six
# NOTE(robinson) - See this issue here
# https://github.com/facebookresearch/detectron2/issues/5010
Pillow<10
unstructured-inference==0.5.9
unstructured-inference

View File

@ -205,8 +205,10 @@ typing-extensions==4.7.1
# torch
tzdata==2023.3
# via pandas
unstructured-inference==0.5.9
# via -r requirements/extra-pdf-image.in
unstructured-inference==0.5.10
# via
# -c requirements/constraints.in
# -r requirements/extra-pdf-image.in
urllib3==1.26.16
# via
# -c requirements/base.txt

View File

@ -1,17 +1,17 @@
[
{
"type": "Title",
"element_id": "0c4e18d78e721c8179f3946b75b17d15",
"element_id": "88591a76b54e47215c0827ae8838ec13",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Instructions for Form 3115 (Rev. November 1987) Annlicatinn far Chance in Accounting Mathond"
"text": "Instructions for Form 3115 (Rev. November 1987)"
},
{
"type": "NarrativeText",
"element_id": "41f3d9c83b2b4679195c9796134fd8f5",
"element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -21,7 +21,7 @@
},
{
"type": "ListItem",
"element_id": "97968e4ba14bd2d082a70ec61ef2d9b1",
"element_id": "36a565493a214d3f7e7f24794c1dc7f4",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -111,7 +111,7 @@
},
{
"type": "ListItem",
"element_id": "f0d2beb7f43493694a91137e8e65b5f3",
"element_id": "59bc2945a7f606bd5078bac3bc1199d4",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -121,7 +121,7 @@
},
{
"type": "ListItem",
"element_id": "13f2a282f705590fbe7b6ce15b08862a",
"element_id": "5157d731aa6a97c9b166799db2295bce",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -141,7 +141,7 @@
},
{
"type": "ListItem",
"element_id": "9820f79275e683f5afe3f2f1283de4ca",
"element_id": "34b66452ca63c465c69d849e4acf6d46",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -161,7 +161,7 @@
},
{
"type": "ListItem",
"element_id": "a98378f4a88db65dff42b7d8bd75be92",
"element_id": "b0fa5aaff0cee8574822dd8ac6537c06",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -181,7 +181,7 @@
},
{
"type": "ListItem",
"element_id": "3cb57c50002187a715e1c5048e643c65",
"element_id": "13f155c0754434406190f3cf49c82c3c",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -201,33 +201,33 @@
},
{
"type": "ListItem",
"element_id": "beeb50db70ce1aa76813cce98e46bd56",
"element_id": "178d6933ed193747b1c4aa1c048e7f94",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "for these changes. Tb od Db bee Cl"
"text": "for these changes."
},
{
"type": "NarrativeText",
"element_id": "640a100da1a3bee6f1f134c51a2c8648",
"element_id": "7685df2334a5f6c8c8099dea61a8f1b4",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed"
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed."
},
{
"type": "Title",
"element_id": "a232d246e22a4f6bb8dcab62cffb2567",
"element_id": "61ed58fa51293f429f87e8cf1896c9e4",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Paperwork Reduction Act Notice We ack for thic infarenatinn te marry mye the."
"text": "Paperwork Reduction Act Notice"
},
{
"type": "Title",
@ -241,37 +241,27 @@
},
{
"type": "ListItem",
"element_id": "58f1649a32eda8b8c513e51a209666a6",
"element_id": "5f8051f8010896bab02aaf784c04ae02",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Signature Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
},
{
"type": "ListItem",
"element_id": "586e989b479e4362ebe28a6954c1427b",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "If the individual or firm is also authorized to"
"text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
},
{
"type": "NarrativeText",
"element_id": "446ccb7d96fea659d50aef8a6dd670df",
"element_id": "4660422c06dddc914ab634c5e4045dec",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the right amount of tax. You are required to give us this information,"
"text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to give us this information."
},
{
"type": "Title",
"element_id": "226fa83297914d5195e002508d61fb1d",
"element_id": "a1547a4ed1611eee44b15e99120fb978",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -281,77 +271,77 @@
},
{
"type": "Title",
"element_id": "f0e951e5bcb4a6070fa6672b37822348",
"element_id": "68a3289177b49b285e133a5267eb355f",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Purpose of Form Cin bce Secon te cece cget."
"text": "Purpose of Form"
},
{
"type": "NarrativeText",
"element_id": "5e5451e052baf894b2bdad4132f6cd2f",
"element_id": "f9b8e17da7a31507773f78959378e09c",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "ee File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
"text": "File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
},
{
"type": "NarrativeText",
"element_id": "cc1701e3ce9347e344b3df80d426bd21",
"element_id": "b3859f2f29884b1d3ba0892e52859a99",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Seti aes When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
"text": "When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
},
{
"type": "NarrativeText",
"element_id": "b81dc18d0f8666f9bf7400a00657dc72",
"element_id": "e5a95dc10d4071983b70898a21f11175",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "POMS SANE OPFOR DA 29). Generally, applicants must complete Section A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired. You must give alll relevant facts, including a"
"text": "Generally, applicants must complete Section A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired."
},
{
"type": "Title",
"element_id": "c7502aa5b000d6446f3eca882518a260",
"element_id": "5756fb398995bb6518a87637f24f426e",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Time and Place for Filing amarall, ammlimeete maet file snete"
"text": "Time and Place for Filing"
},
{
"type": "NarrativeText",
"element_id": "8b35e7c212710b1099b675ce9394fb47",
"element_id": "25f830e7c39c115c9937eb9d11cfb1f2",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Se NB ON State whether you desire a conference in the National Office if the Service proposes to disapprove your application."
"text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application"
},
{
"type": "Title",
"element_id": "0a16a0fea889be77576c0fd88575554a",
"element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Affiliated Groups Tavmayare that ara mam)"
"text": "Affiliated Groups"
},
{
"type": "Title",
"element_id": "68b58298cabd9069c975b192a7183139",
"element_id": "242a9dba10a04654d4adef9c58ff96f6",
"metadata": {
"data_source": {},
"filetype": "image/png",
@ -361,62 +351,62 @@
},
{
"type": "Title",
"element_id": "6a8881a6e87021b2362243f7df3e4b1d",
"element_id": "11c98a9cbd6a200fbc5b93fed15007ac",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Uniform capitalization rules and limitation on cash method.—If you are required to char"
"text": "Uniform capitalization rules and limitation on"
},
{
"type": "Title",
"element_id": "8daeb8b48fb666f1dd54e2af283d0c22",
"element_id": "58703de56debc34a1d68e6ed6f8fd067",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Specific Instructions Section A Neem Ea mama 1 !Taeahle inemes"
"text": "Specific Instructions Section A"
},
{
"type": "Title",
"element_id": "09203a0c6955f64ca8eb52cd6ea47034",
"element_id": "a4316c02df07840f1beb56609cb09735",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Late Applications Me coup armlimatinm te ler"
"text": "Late Applications"
},
{
"type": "NarrativeText",
"element_id": "962e3f0ceb1f0b1b08a1c19adde8d962",
"element_id": "39458f370b98a606db29ac6dee975e07",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "lethal elaine bela Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the"
"text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and"
},
{
"type": "Title",
"element_id": "bfe98eb672d95c15a11ed3e618928b4e",
"element_id": "025a65465b6fd9635316e92633b24c7e",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "Identifying Number Ndiuidesale Am omptisoehesal"
"text": "Identifying Number"
},
{
"type": "NarrativeText",
"element_id": "87f8128b03a72c616ee1a1bb91e11c56",
"element_id": "9240bfa889b87dc2fb3fa746ca4eeeb4",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
"text": "—e—e—— eee Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
"text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
}
]

View File

@ -1111,13 +1111,13 @@
},
{
"type": "FigureCaption",
"element_id": "b5ee6af3d776b0bbd2e581a3ab2ab2e1",
"element_id": "27b45633a0f31b9e01d179d70d7dc282",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)"
"text": "5 1 os = — 10; =o ° © —\" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)"
},
{
"type": "UncategorizedText",
@ -1141,13 +1141,13 @@
},
{
"type": "Table",
"element_id": "e2ed41967a486766ad6a122cc3aba4d5",
"element_id": "9270ab0a1b3ba26a16991abcd0b45dfe",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "Inhibitorconcentration (g) bc (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm2) Polarizationresistance (Ω) Corrosionrate (mm/year) 0246810 0.03351.94600.01630.32330.12400.0382 0.04090.05960.23690.05400.05560.0086 (cid:3) 0.9393(cid:3) 0.8276(cid:3) 0.8825(cid:3) 0.8027(cid:3) 0.5896(cid:3) 0.5356 0.00030.00020.00015.39E-055.46E-051.24E-05 24.0910121.44042.121373.180305.650246.080 2.81631.50540.94760.43180.37720.0919"
"text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
},
{
"type": "UncategorizedText",
@ -1471,13 +1471,13 @@
},
{
"type": "FigureCaption",
"element_id": "6959a323ee23c858c3b1411b05db6ebf",
"element_id": "273fb301b173075f79b2cbdab962e2ff",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
"text": "SEM HV: Q0KY WD: 14.89 rmrmDEM MAO: 209 x Det: DOE Pecforsence In nenospact"
"text": "SEM HV: Q0KY WD: 14.89 rmrm 9EM MAO: 209 x Det: DOE Pectomsence In nanospact"
},
{
"type": "NarrativeText",
@ -1491,13 +1491,13 @@
},
{
"type": "FigureCaption",
"element_id": "a0463ca888a6f2c8c3ba40ba47be0f2f",
"element_id": "d04d110c16a4ebc184fa130f09b8d423",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
"text": "gEOOwaeSemny. z00RV | WD: 1424 renn rtirint VEoa3 Tescan20 yin Fertormaros in nancepace|"
"text": "Sem ny. 200 Rv"
},
{
"type": "NarrativeText",
@ -1511,13 +1511,13 @@
},
{
"type": "FigureCaption",
"element_id": "a9bc28448ebad437288bf5538fb09482",
"element_id": "520d1da08c86ce165cd2843e2dc27f98",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
"text": "SEM HY: 20.0KVBEM IAAG: 400 x 5"
"text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE"
},
{
"type": "NarrativeText",
@ -1579,6 +1579,16 @@
},
"text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [35]. The structural formula of egg shell powder is shown in Fig. 9."
},
{
"type": "FigureCaption",
"element_id": "060e14f01e484ba252e902cd5c6f94f9",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
"text": "ou H,;COCHNY OH"
},
{
"type": "UncategorizedText",
"element_id": "c07eeb615f8b0f2d544348b7f0655301",

View File

@ -791,13 +791,13 @@
},
{
"type": "Table",
"element_id": "be8fbf813482eec7fd0e2fc665b4d3bb",
"element_id": "1d8fd023cd0978f7a6500815d2ad0ef6",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
"text": "Instance size (m, n) Average number of (8, 1500)(8, 2000)(8, 2500)(8, 3000)(12, 1500)(12, 2000)(12, 2500)(12, 3000)(16, 1500)(16, 2000)(16, 2500)(16, 3000) Locations Times Vehicles Possible empty travels 568.40672.80923.40977.00566.00732.60875.001119.60581.80778.00879.001087.20 975.201048.001078.001113.20994.001040.601081.001107.40985.401040.601083.201101.60 652.20857.201082.401272.80642.00861.201096.001286.20667.80872.401076.401284.60 668,279.401,195,844.801,866,175.202,705,617.00674,191.001,199,659.801,878,745.202,711,180.40673,585.801,200,560.801,879,387.002,684,983.60"
"text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60"
},
{
"type": "UncategorizedText",

View File

@ -591,13 +591,13 @@
},
{
"type": "FigureCaption",
"element_id": "00401461c83b8b07511a4864781d8f8d",
"element_id": "812dcaaec927a84d57af36e20adb5ded",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
"text": "Model Customization Community PlatformEfficient Data Annotation ¥y DIA Model HubCustomized Model Training] === | Layout Detection Models | ===OCR Module =— | Layout Data Structure | ==The Core LayoutParser LibraryDIA Pipeline SharingStorage & Visualizationi"
"text": "Efficient Data Annotation Model Customization Document Images Community Platform a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY"
},
{
"type": "NarrativeText",
@ -681,14 +681,14 @@
},
{
"type": "Table",
"element_id": "71e289a268220c21575bb55a73980b83",
"element_id": "34923b77ca76e1808956ade5e766f7c2",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"text_as_html": "<table><thead><th>Dataset</th><th>| Base Model'|</th><th>| Notes</th></thead><tr><td>PubLayNet</td><td>[38] F/M</td><td>Layouts of modern scientific documents</td></tr><tr><td>PRImA [3]</td><td>M</td><td>Layouts of scanned modern magazines and scientific reports</td></tr><tr><td>Newspaper</td><td>F</td><td>Layouts of scanned US newspapers from the 20th century</td></tr><tr><td>TableBank</td><td>F</td><td>Table region on modern scientific and business document</td></tr><tr><td>HJDataset [31]</td><td>F/M</td><td>Layouts of history Japanese documents</td></tr></table>"
},
"text": "Dataset Base Model1 Large Model Notes PubLayNet [38]PRImA [3]Newspaper [17]TableBank [18]HJDataset [31] F / MMFFF / M M--F- Layouts of modern scientific documentsLayouts of scanned modern magazines and scientific reportsLayouts of scanned US newspapers from the 20th centuryTable region on modern scientific and business documentLayouts of history Japanese documents"
"text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents"
},
{
"type": "UncategorizedText",
@ -852,13 +852,13 @@
},
{
"type": "FigureCaption",
"element_id": "2f498bdd91739a7083490999507420a5",
"element_id": "185e67615d123b35d38ea72e0cdb6d99",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
"text": "33§3 fectange vada8883 Coordinate83 +*Block | [Block | [Read8 Extra features Tet | [Tye | [oder[ coordinatel textblock1 |» , see383 , textblock2 , layout] ]4A list of the layout elementsThe same transformation and operation APIs"
"text": "- ° . 3 a a 4 a 3 oo er 2 § 8 a 8 3 3 £ 4 A g a 9 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff"
},
{
"type": "NarrativeText",
@ -1072,14 +1072,14 @@
},
{
"type": "Table",
"element_id": "548c38f86edc295baf869abe37a0d1cf",
"element_id": "f81d4915b54758e0d4d52af3566bb813",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"text_as_html": "<table><thead><th>Operation Name</th><th></th><th>|</th><th>Description</th></thead><tr><td>block.pad(top, bottom,</td><td>right,</td><td>left) |</td><td>Enlarge the current block according to the input</td></tr><tr><td>block.scale(fx, fy)</td><td></td><td></td><td>Scale the current block given the ratio in x and y direction</td></tr><tr><td>block.shift(dx, dy)</td><td></td><td></td><td>Move the current block with the shift distances in x and y direction</td></tr><tr><td>blocki.is_in(block2)</td><td></td><td>|</td><td>Whether block] is inside of block2</td></tr><tr><td>blocki.intersect (block2)</td><td></td><td></td><td>Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs.</td></tr><tr><td>block1i.union(block2)</td><td></td><td></td><td>Return the union region of blockl and block2. Coordinate type to be determined based on the inputs.</td></tr><tr><td>blocki.relative_to(block2)</td><td></td><td></td><td>Convert the absolute coordinates of block] to relative coordinates to block2</td></tr><tr><td>blocki.condition_on(block2)</td><td></td><td></td><td>Calculate the absolute coordinates of blockl given the canvas block2s absolute coordinates</td></tr></table>"
},
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) block.shift(dx, dy) Scale the current block given the ratioin x and y direction Move the current block with the shiftdistances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) block1.union(block2) block1.relative to(block2) block1.condition on(block2) Return the intersection region of block1 and block2.Coordinate type to be determined based on the inputs. Return the union region of block1 and block2.Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block1 torelative coordinates to block2 Calculate the absolute coordinates of block1 giventhe canvas block2s absolute coordinates Obtain the image segments in the block region"
"text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2s absolute coordinates block. crop_image (image) Obtain the image segments in the block region"
},
{
"type": "UncategorizedText",
@ -1343,13 +1343,13 @@
},
{
"type": "FigureCaption",
"element_id": "6df6057f894a166cf24fd34f64267f09",
"element_id": "975d6cb141cb0a0313375630ae063fa8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9
},
"text": "a ESStee eaeoooMode I: Showing Layout on the Original ImageMode Il: Drawing OCR'd Text at the Correspoding Position10g Bpunog vayoy feyds1q :1 vondoxog Burpunog vay apiH z word"
"text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position"
},
{
"type": "NarrativeText",
@ -1413,13 +1413,13 @@
},
{
"type": "FigureCaption",
"element_id": "42aa5660e30073a0282c086fe4f29fce",
"element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 10
},
"text": "Column reading orderMaximum Allowed HeightZ. Shen et al.Intra-column reading order(b) Illustration of the recreated document with dense text structure for better OCR performanceToken CategoriesMoteAddresstetNumberVaribiecompany typeColumn Categories(J tite| Aatress(tee[7] section adr"
"text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance"
},
{
"type": "NarrativeText",
@ -1533,13 +1533,13 @@
},
{
"type": "FigureCaption",
"element_id": "55f2474c66877608ca9b463a7076573e",
"element_id": "b33b2bc3b9c416673c7f74c6a00c49d8",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11
},
"text": "(spe peepee,Active Learning Layout=Annotate Layout Dataset parte4zi Deep Learning LayoutLayout Detection Model Training & Inference,Post-processin Handy Data Structures &pl 9 APIs for Layout DataText Recognition Default and Customized: r OCR Models4Visualization & Export | <——Layout StructureVisualization & StorageThe Japanese Document Helpful LayoutParserDigitization Pipeline Modules"
"text": "(spe peepee, Active Learning Layout Annotate Layout Dataset | + Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules"
},
{
"type": "UncategorizedText",
@ -1713,13 +1713,13 @@
},
{
"type": "FigureCaption",
"element_id": "f58d47bde7ebddd81c4a678c918a8f1b",
"element_id": "7d42bb6af1404a95a6e8870d5c4d07bf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 13
},
"text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line"
"text": "(@) Partial table at the bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line"
},
{
"type": "NarrativeText",

View File

@ -1701,7 +1701,7 @@
},
{
"type": "FigureCaption",
"element_id": "36ca9b7cdbbcba729a46487cf86c07eb",
"element_id": "eeda9f9210dfe4be7e82b4385290d3ca",
"metadata": {
"data_source": {},
"filetype": "application/pdf",

View File

@ -1119,16 +1119,6 @@
},
"text": "6"
},
{
"type": "FigureCaption",
"element_id": "f58b520072d30c4805940f5c99a306c3",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9
},
"text": "an ¥3 te,ay."
},
{
"type": "NarrativeText",
"element_id": "d754d8d468346f652657279272a11897",

View File

@ -1 +1 @@
__version__ = "0.10.1-dev0" # pragma: no cover
__version__ = "0.10.1-dev1" # pragma: no cover