diff --git a/CHANGELOG.md b/CHANGELOG.md
index b68305d17..5d1fbc96e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,7 +1,8 @@
-## 0.10.4-dev0
+## 0.10.4
### Enhancements
* Adds ability to reuse connections per process in unstructured-ingest
+* Pass ocr_mode in partition_pdf and set the default back to individual pages for now
### Features
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
index c56768e87..1ab8fb9ba 100644
--- a/test_unstructured/partition/test_pdf.py
+++ b/test_unstructured/partition/test_pdf.py
@@ -177,6 +177,7 @@ def test_partition_pdf_with_model_name_env_var(
filename,
is_image=False,
ocr_languages="eng",
+ ocr_mode="individual_blocks",
extract_tables=False,
model_name="checkbox",
)
@@ -197,6 +198,7 @@ def test_partition_pdf_with_model_name(
filename,
is_image=False,
ocr_languages="eng",
+ ocr_mode="individual_blocks",
extract_tables=False,
model_name="checkbox",
)
@@ -402,6 +404,7 @@ def test_partition_pdf_with_dpi():
filename,
is_image=False,
ocr_languages="eng",
+ ocr_mode="individual_blocks",
extract_tables=False,
model_name=None,
pdf_image_dpi=100,
diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
index a0f7dac8f..7e3e0c4c7 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.pdf.json
@@ -1,13 +1,13 @@
[
{
"type": "Title",
- "element_id": "88591a76b54e47215c0827ae8838ec13",
+ "element_id": "05ca56aec1964bf626b4012a5b4a7c55",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Instructions for Form 3115 (Rev. November 1987)"
+ "text": "SERGE EECHE PAW VIMY wuUl VivaInstructions for Form 3115(Rev. November 1987)BP nw wo BE oe oe et a ee fia fl ae ee iw OM ee eee Le ye RA. LL. J"
},
{
"type": "NarrativeText",
@@ -31,13 +31,13 @@
},
{
"type": "ListItem",
- "element_id": "14e8cc92f6875b7562c7b37b363a4271",
+ "element_id": "e8d040fcadaf595b3624579225028b80",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Long-term contracts. —If you are required to change your method of accounting for long-term contracts under section"
+ "text": "Long-term contracts. —If you are required tochange your method of accounting for long-termcontracts under section"
},
{
"type": "ListItem",
@@ -121,23 +121,23 @@
},
{
"type": "ListItem",
- "element_id": "2beea67e67a36c0708e98cba96d1779f",
+ "element_id": "64b044a845d6a903604d0edc68d5c8d1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": ", for the notification procedures that must be followed. Other methods.—Unless the Service has published a regulation or procedure to the contrary, all other changes !n accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: ("
+ "text": ", for the notificationprocedures that must be followed.Other methods.—Unless the Service haspublished a regulation or procedure to thecontrary, all other changes !n accountingmethods required by the Act are automaticallyconsidered to be approved by the Commissioner.Examples of method changes automaticallyapproved by the Commissioner are those changesrequired to effect: ("
},
{
"type": "ListItem",
- "element_id": "5157d731aa6a97c9b166799db2295bce",
+ "element_id": "aed90f3480456a62ac47f6cf5c5e526a",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": ") the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section"
+ "text": ") the repeal of the reservemethod for bad debts of taxpayers other thanfinancial institutions (Act section"
},
{
"type": "ListItem",
@@ -151,13 +151,13 @@
},
{
"type": "ListItem",
- "element_id": "34b66452ca63c465c69d849e4acf6d46",
+ "element_id": "fdf216c15df57c2716f392d4cc8b2fbe",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": ") the repeal of the installment method for sales under a revolving credit plan (Act section"
+ "text": ") therepeal of the installment method for sales undera revolving credit plan (Act section"
},
{
"type": "ListItem",
@@ -171,13 +171,33 @@
},
{
"type": "ListItem",
- "element_id": "353b10e26575591f537f9718242cd574",
+ "element_id": "4df2762fd804bd5859df0774d1d51796",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": ") the Inclusion of income attributable to the sale or furnishing of utility services no later than the year In which the services were provided to customers (Act section"
+ "text": ") th"
+ },
+ {
+ "type": "ListItem",
+ "element_id": "6ea7ec2e8449de6c5c662bb59e333fa7",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 1
+ },
+ "text": "nclusion of income attributable to the sale orfurnishing of utility services no later than the yea"
+ },
+ {
+ "type": "ListItem",
+ "element_id": "2a639c819f6663cf3a9940f3528b3205",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 1
+ },
+ "text": "n which the services were provided to customers(Act section"
},
{
"type": "ListItem",
@@ -191,13 +211,13 @@
},
{
"type": "ListItem",
- "element_id": "13f155c0754434406190f3cf49c82c3c",
+ "element_id": "1a278d181295c8d1b6bfd86baca09eaf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": ") the repeal of the deduction for qualified discount coupons (Act section"
+ "text": ") the repeal of thededuction for qualified discount coupons (Actsection"
},
{
"type": "ListItem",
@@ -211,13 +231,13 @@
},
{
"type": "ListItem",
- "element_id": "178d6933ed193747b1c4aa1c048e7f94",
+ "element_id": "e9cae276abe56d0cb30fcf798f0c134e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "for these changes."
+ "text": "for thesechanges."
},
{
"type": "Title",
@@ -241,13 +261,23 @@
},
{
"type": "ListItem",
- "element_id": "f09181ea8ac5d177b8d2f79bbae03f18",
+ "element_id": "d4b18f9d6e11f561661bef4f8bc5fb7c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Signature Individuals. —An individual desiring the change should sign the application. If the application pertains to a husband and wife filing a joint income tax return, the names of both should appear in the heading and both should sign. Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized to sign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file. For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation. Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrix, etc., having legal authority to sign, and his or her title. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
+ "text": "Signatur"
+ },
+ {
+ "type": "ListItem",
+ "element_id": "6e9dc7d49fe15e842fbd7373af8d020a",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 1
+ },
+ "text": "ndividuals. —An individual desiring the changeshould sign the application. If the applicationpertains to a husband and wife filing a jointincome tax return, the names of both shouldappear in the heading and both should sign.Partnerships.—The form should be signed withthe partnership name followed by the signatureof one of the general partners and the words“General Partner.”Corporations, cooperatives, and insurancecompanies.—The form should show the name ofthe corporation, cooperative, or insurancecompany and the signature of the president, vicepresident, treasurer, assistant treasurer, or chiefaccounting officer (such as tax officer) authorizedto sign, and his or her official title. Receivers,trustees, or assignees must sign any applicationthey are required to file. For a subsidiarycorporation filing a consolidated return with itsparent, the form should be signed by an officer ofthe parent corporation.Fiduciaries.—The-form should show the nameof the estate or trust and be signed by thefiduciary, personal representative, executor,executrix, administrator, administratrix, etc.,having legal authority to sign, and his or her title.Preparer other than partner, officer, etc.—Thesignature of the individual preparing theapplication should appear in the space providedon page"
},
{
"type": "ListItem",
@@ -261,13 +291,13 @@
},
{
"type": "NarrativeText",
- "element_id": "828767cbc922e731b59894afba55fe10",
+ "element_id": "989ff7b05e9807cf0865ac828552f045",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws and to allow us to figure and collect the right amount of tax. You are required to give us this information."
+ "text": "We ask for this information to carry out theInternal Revenue laws of the United States. Weneed it to ensure that taxpayers are complyingwith these laws and to allow us to figure andcollect the right amount of tax. You are requiredto give us this information."
},
{
"type": "Title",
@@ -291,13 +321,13 @@
},
{
"type": "NarrativeText",
- "element_id": "84e7e32f584e2ee9f47ba593bf86c559",
+ "element_id": "4a52253d27bd51d65285045e1e3e3cf1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Generally, applicants must complete Section A. In addition, complete the appropriate sections (B-1 through H) for which a change Is desired."
+ "text": "Generally, applicants must complete SectionA. In addition, complete the appropriate sections(B-1 through H) for which a change Is desired.Vinee mised evissn all palecsninte Sante ime!"
},
{
"type": "Title",
@@ -311,23 +341,23 @@
},
{
"type": "Title",
- "element_id": "af8bdf713f162b09567c8d1a3a2d4de7",
+ "element_id": "476eb0569b23e73460f08455530f0d4b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Generally, applicants must file this form within the first 180 days of the tax year in which it is desired to make the change."
+ "text": "Generally, applicants must file this form withinthe first 180 days of the tax year in which it isdesired to make the change."
},
{
"type": "Title",
- "element_id": "1df7107903f249d938fbf3710f50283a",
+ "element_id": "71b6d7f72c57641ea91dd411abdc9959",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "If the individual or firm is also authorized to represent the applicant before the IRS, receive a copy of the requested ruling, or perform any other act(s), the power of attorney must reflect such authorization(s)."
+ "text": "MIT RGSSIf the individual or firm is also authorized torepresent the applicant before the IRS, receivea copy of the requested ruling, or perform anyother act(s), the power of attorney must reflectsuch authorization(s)."
},
{
"type": "Title",
@@ -341,33 +371,33 @@
},
{
"type": "Title",
- "element_id": "242a9dba10a04654d4adef9c58ff96f6",
+ "element_id": "cd746731c7a892b0087828c0801c022b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986"
+ "text": "Changes to Accounting MethodsRequired Under the Tax Reform Actof 1986"
},
{
"type": "Title",
- "element_id": "58703de56debc34a1d68e6ed6f8fd067",
+ "element_id": "f0a757884fb918f704c1d90b762f5894",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Specific Instructions Section A"
+ "text": "Specific InstructionsSection A"
},
{
"type": "Title",
- "element_id": "12f877f0bd47f9b761ed7e74be1afacd",
+ "element_id": "d1e074ec4e3a00f9e646b34b3ff94101",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Note: /f this form is being filed in accordance with Rev. Proc. 74-11, see Section G below."
+ "text": "GIL SGNTY GRPNVaNUT.Note: /f this form is being filed in accordancewith Rev. Proc. 74-11, see Section G below."
},
{
"type": "Title",
@@ -381,13 +411,13 @@
},
{
"type": "NarrativeText",
- "element_id": "eb076cfd3d47e546c28611750afedc49",
+ "element_id": "6e1d51f920ee67d5cfb7a2600d4cb494",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Place for Filing and Late Applications. Instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(a) adjustment will be taken into account and"
+ "text": "Disregard the instructions under Time andPlace for Filing and Late Applications. Instead,attach Form 3115 to your income tax return forthe year of change; do not file it separately. Alsoinclude on a separate statement accompanyingthe Form 3115 the period over which the section481(a) adjustment will be taken into account andthe basis for that conclusion. Identify the"
},
{
"type": "Title",
@@ -401,13 +431,13 @@
},
{
"type": "NarrativeText",
- "element_id": "742730130f9c14403ad272eec208a456",
+ "element_id": "e054f522926ec7602c8380a8d7eb3296",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
},
- "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block."
+ "text": "Others.-—The employer identification number ofan applicant other than an individual should beentered in this block."
},
{
"type": "Title",
@@ -431,73 +461,83 @@
},
{
"type": "ListItem",
- "element_id": "b9c2a964cd107c5155ef70e5b235a05d",
+ "element_id": "f27e09e405abe4f2f2a9a28fad38974d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(f) provides that the term “long-term contract” means any contract for the manufacturing, building, installation, or construction of property that is not completed within the tax year in which it"
+ "text": "(f) provides that the term “long-terncontract” means any contract for themanufacturing, building, installation, orconstruction of property that is not completedwithin the tax year in which it"
},
{
"type": "ListItem",
- "element_id": "8e69cd6874d876dce416a44e695b58eb",
+ "element_id": "cf29164f7821b3a6775b230f5e247551",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "s entered into. However, a manufacturing contract will not qualify as a long-term contract unless the contract involves the manufacture of: ("
+ "text": "s entered into.However, a manufacturing contract will notqualify as a long-term contract unless thecontract involves the manufacture of: ("
},
{
"type": "ListItem",
- "element_id": "1b69bc9514700ed89e0af2872cbb95c8",
+ "element_id": "dd39fef35cb957547bd3efad8b3d6557",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ") a unique item not normally included in your finished goods inventory, or ("
+ "text": ") aunique item not normally included in yourfinished goods inventory, or ("
},
{
"type": "ListItem",
- "element_id": "aaa7abdc10628a69ab04fcea8ecdc29d",
+ "element_id": "ae214de0f0455b7dc7212c1f815d65d4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ") any item that normally requires more than"
+ "text": ") any item thatnormally requires more than"
},
{
"type": "ListItem",
- "element_id": "86bbefb59cb32bc6b6ff1b92e0b76d6f",
+ "element_id": "0fda0b69a885bf1425cddd8675d70be1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "calendar months to complete."
+ "text": "calendar monthto complete."
},
{
"type": "NarrativeText",
- "element_id": "1bbe995811e9fd4c3ce1b218cb641f4e",
+ "element_id": "c6b3c248ee1c921f6196a7e5cd870d67",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(1) Gives your best estimate of the percentage of the section 481(a) adjustment that would have been required if the requested change had been made for each of the 3 preceding years; and"
+ "text": "(1) Gives your best estimate of the percentageof the section 481(a) adjustment that would havebeen required if the requested change had beenmade for each of the 3 preceding years; andVAN C."
},
{
"type": "ListItem",
- "element_id": "91057a4a80779d62b06d27fdce5da42c",
+ "element_id": "99618a049629ef4f50aeafc1a365ad75",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "All long-term contracts entered into after February"
+ "text": "AMM RIM AIAll long-term contracts entered into afte"
+ },
+ {
+ "type": "ListItem",
+ "element_id": "f7ca8476d7c8a3ac84efbd8699f97f87",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 2
+ },
+ "text": "ebruary"
},
{
"type": "ListItem",
@@ -511,33 +551,33 @@
},
{
"type": "ListItem",
- "element_id": "7613695d576752ab22ae7c02866cf1e3",
+ "element_id": "b60ab3f42291035b6184fde93a3b9230",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ", except for real property construction contracts expected to be completed within"
+ "text": ", except for real propertyconstruction contracts expected to be completedwithin"
},
{
"type": "ListItem",
- "element_id": "a288051b2eda0f2b8d6b45647c73a1ad",
+ "element_id": "93bcd9d786ff021bed0fe0c9d71fc976",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "years by contractors whose average annual gross receipts for the"
+ "text": "years by contractors whose averageannual gross receipts for the"
},
{
"type": "ListItem",
- "element_id": "f2923844fb3e4992f1c6ddd808867d96",
+ "element_id": "9ff4779aaab33521b8398aeb72f613c0",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "prior tax years do not exceed $"
+ "text": "prior tax years donot exceed $"
},
{
"type": "ListItem",
@@ -561,43 +601,43 @@
},
{
"type": "ListItem",
- "element_id": "4df00d9659b3bfaac5990114275c4bf5",
+ "element_id": "1e970967cee7e2aa31666b6108587f35",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ", must be accounted for using either the percentage of completion- capitalized cost method or the percentage of completion method. See section"
+ "text": ", must be accounted forusing either the percentage of completion-capitalized cost method or the percentage ofcompletion method. See section"
},
{
"type": "ListItem",
- "element_id": "dcf589bb37d079ecce4b375abc332606",
+ "element_id": "070baf413b0aca84064c63f5afaf041e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Caution: At the time these instructions were printed, Congress was considering legislation that would repeal the use of the percentage of completion-capitalized cost method for certain long-term contracts."
+ "text": "Caution: At the time these instructions wereprinted, Congress was considering legislation thatwould repeal the use of the percentage ofcompletion-capitalized cost method for certainlong-term contracts."
},
{
"type": "NarrativeText",
- "element_id": "2de8f0b5003bcb8c12a4dc59c8e1f740",
+ "element_id": "6db00b1816c20e862ee46d0de12e17fa",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "See section 5.06(2) of Rev. Proc. 84-74 for the required perjury statement that must be attached."
+ "text": "—_———eeeerorT eeeSee section 5.06(2) of Rev. Proc. 84-74 for therequired perjury statement that must beattached."
},
{
"type": "NarrativeText",
- "element_id": "751abc8c6a0fa412c3e8c18345f57f95",
+ "element_id": "db1cb1f9a7219a27df1875b2cfd5475c",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Item 13, page 2.—Insert the actual number of tax years. Use of the term “since inception” 1s not acceptable. However, “more than 6 years” Is acceptable."
+ "text": "TE RIG TINEME FN eke!Item 13, page 2.—Insert the actual number oftax years. Use of the term “since inception” 1s notacceptable. However, “more than 6 years” Isacceptable."
},
{
"type": "Title",
@@ -641,23 +681,23 @@
},
{
"type": "Title",
- "element_id": "4688916bf1d6b205af02a0e954156688",
+ "element_id": "6ccbf93cd42f38f04abdba8a103c8350",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "Limitation on the Use of the Cash Method of Accounting. —Except as provided below, C"
+ "text": "Limitation on the Use of the Cash Method ofAccounting. —Except as provided below, C"
},
{
"type": "NarrativeText",
- "element_id": "aaf93c2be8f4f2db87bd760783fedfa5",
+ "element_id": "851830b0996c633165de287a96eb0aa4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "corporations, partnerships with a C corporation as a partner, and tax shelters may not use the cash method of accounting. For purposes of this limitation, a trust subject to the tax on unrelated business income under section 511 1s treated as aC corporation with respect to its unrelated trade or business activities."
+ "text": "corporations, partnerships with a C corporationas a partner, and tax shelters may not use thecash method of accounting. For purposes of thislimitation, a trust subject to the tax on unrelatedbusiness income under section 511 1s treated asaC corporation with respect to its unrelated tradeor business activities."
},
{
"type": "Title",
@@ -671,33 +711,33 @@
},
{
"type": "NarrativeText",
- "element_id": "e5bed7fe04dd22cabe5e5c0362d37743",
+ "element_id": "54f2708b4cfb39e6586ec74244fe7f1e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "The limitation on the use of the cash method (except for tax shelters) does not apply to—"
+ "text": "The limitation on the use of the cash method(except for tax shelters) does not apply to—"
},
{
"type": "ListItem",
- "element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
+ "element_id": "ca6f93345af1b79e8253b00b046b4403",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "("
+ "text": "LEXCEPt TOF Lax SNENETS) GOES NOL apply LO-——("
},
{
"type": "ListItem",
- "element_id": "e388a9c123531db35a336ca587dc1a78",
+ "element_id": "daaf36cd7c9f373f7192a7f76716cfc4",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ") Farming businesses.—F or this purpose, the term “farming business”"
+ "text": ") Farming businesses.—F or this purpose,the term “farming business”"
},
{
"type": "ListItem",
@@ -721,13 +761,13 @@
},
{
"type": "ListItem",
- "element_id": "124f8e567bb2fc32647f9a44201e0688",
+ "element_id": "920fa4651462da72706415162fc8bc85",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "), but it also includes the raising, harvesting, or growing of trees to which section"
+ "text": "), but it also includes the raising,harvesting, or growing of trees to which section"
},
{
"type": "ListItem",
@@ -741,33 +781,33 @@
},
{
"type": "ListItem",
- "element_id": "91621b3a2068ab97aafa195a272a663e",
+ "element_id": "b4537ecf064e370911fbd07081bd5bc7",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": ") applies. Notwithstanding this exception, section"
+ "text": ") applies. Notwithstanding thisexception, section"
},
{
"type": "ListItem",
- "element_id": "70360b86614c25f67ca8959ac00d5389",
+ "element_id": "883d3cfcbe67e5a5f2ba5cf430c5129e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "requires certain C corporations and partnerships with a C corporation as a partner to use the accrual method."
+ "text": "requires certain Ccorporations and partnerships with a Ccorporation as a partner to use the accrualmethod.YAN Nal find maccamalpnansan pnenapapiocna"
},
{
"type": "NarrativeText",
- "element_id": "86d11953bb813a770ecd242ff97d4e43",
+ "element_id": "1fbc7ab18ebbfd6edfcbe19b4d5a84cd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "If you are making an election under section 458, show the applicable information under Regulations section 1.458-10."
+ "text": "If you are making an election under section458, show the applicable information underRegulations section 1.458-10."
},
{
"type": "Title",
@@ -781,12 +821,12 @@
},
{
"type": "NarrativeText",
- "element_id": "0607edfa2419dd0cdc80f457872fe238",
+ "element_id": "3e5744a95d40d31aed481a28b3859577",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
},
- "text": "(2) Qualified personal service corporations. — A “qualified personal service corporation” is any corporation: (a) substantially all of the activities of which involve the performance of services in the fields of health, law,"
+ "text": "(2) Qualified personal service corporations. —A “qualified personal service corporation” is anycorporation: (a) substantially all of the activitiesof which involve the performance of services inthe fields of health, law, engineering,"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
index 09e9c5722..67312ec63 100644
--- a/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
+++ b/test_unstructured_ingest/expected-structured-output/azure/IRS-form-1987.png.json
@@ -1,17 +1,17 @@
[
{
"type": "Title",
- "element_id": "88591a76b54e47215c0827ae8838ec13",
+ "element_id": "0c4e18d78e721c8179f3946b75b17d15",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Instructions for Form 3115 (Rev. November 1987)"
+ "text": "Instructions for Form 3115 (Rev. November 1987) Annlicatinn far Chance in Accounting Mathond"
},
{
"type": "NarrativeText",
- "element_id": "766cf1d1243ef2cdbb0db5ad32d7f9c9",
+ "element_id": "41f3d9c83b2b4679195c9796134fd8f5",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -21,7 +21,7 @@
},
{
"type": "ListItem",
- "element_id": "36a565493a214d3f7e7f24794c1dc7f4",
+ "element_id": "97968e4ba14bd2d082a70ec61ef2d9b1",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -111,7 +111,7 @@
},
{
"type": "ListItem",
- "element_id": "59bc2945a7f606bd5078bac3bc1199d4",
+ "element_id": "f0d2beb7f43493694a91137e8e65b5f3",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -121,7 +121,7 @@
},
{
"type": "ListItem",
- "element_id": "5157d731aa6a97c9b166799db2295bce",
+ "element_id": "13f2a282f705590fbe7b6ce15b08862a",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -141,7 +141,7 @@
},
{
"type": "ListItem",
- "element_id": "34b66452ca63c465c69d849e4acf6d46",
+ "element_id": "9820f79275e683f5afe3f2f1283de4ca",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -161,7 +161,7 @@
},
{
"type": "ListItem",
- "element_id": "b0fa5aaff0cee8574822dd8ac6537c06",
+ "element_id": "a98378f4a88db65dff42b7d8bd75be92",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -181,7 +181,7 @@
},
{
"type": "ListItem",
- "element_id": "13f155c0754434406190f3cf49c82c3c",
+ "element_id": "3cb57c50002187a715e1c5048e643c65",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -201,33 +201,33 @@
},
{
"type": "ListItem",
- "element_id": "178d6933ed193747b1c4aa1c048e7f94",
+ "element_id": "beeb50db70ce1aa76813cce98e46bd56",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "for these changes."
+ "text": "for these changes. Tb od Db bee Cl"
},
{
"type": "NarrativeText",
- "element_id": "7685df2334a5f6c8c8099dea61a8f1b4",
+ "element_id": "640a100da1a3bee6f1f134c51a2c8648",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed."
+ "text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed"
},
{
"type": "Title",
- "element_id": "61ed58fa51293f429f87e8cf1896c9e4",
+ "element_id": "a232d246e22a4f6bb8dcab62cffb2567",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Paperwork Reduction Act Notice"
+ "text": "Paperwork Reduction Act Notice We ack for thic infarenatinn te marry mye the."
},
{
"type": "Title",
@@ -241,27 +241,37 @@
},
{
"type": "ListItem",
- "element_id": "5f8051f8010896bab02aaf784c04ae02",
+ "element_id": "58f1649a32eda8b8c513e51a209666a6",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
+ "text": "Signature Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
+ },
+ {
+ "type": "ListItem",
+ "element_id": "586e989b479e4362ebe28a6954c1427b",
+ "metadata": {
+ "data_source": {},
+ "filetype": "image/png",
+ "page_number": 1
+ },
+ "text": "If the individual or firm is also authorized to"
},
{
"type": "NarrativeText",
- "element_id": "4660422c06dddc914ab634c5e4045dec",
+ "element_id": "446ccb7d96fea659d50aef8a6dd670df",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the nght amount of tax. You are required to give us this information."
+ "text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the right amount of tax. You are required to give us this information,"
},
{
"type": "Title",
- "element_id": "a1547a4ed1611eee44b15e99120fb978",
+ "element_id": "226fa83297914d5195e002508d61fb1d",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -271,77 +281,77 @@
},
{
"type": "Title",
- "element_id": "68a3289177b49b285e133a5267eb355f",
+ "element_id": "f0e951e5bcb4a6070fa6672b37822348",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Purpose of Form"
+ "text": "Purpose of Form Cin bce Secon te cece cget."
},
{
"type": "NarrativeText",
- "element_id": "f9b8e17da7a31507773f78959378e09c",
+ "element_id": "5e5451e052baf894b2bdad4132f6cd2f",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
+ "text": "ee File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
},
{
"type": "NarrativeText",
- "element_id": "b3859f2f29884b1d3ba0892e52859a99",
+ "element_id": "cc1701e3ce9347e344b3df80d426bd21",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
+ "text": "Seti aes When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
},
{
"type": "NarrativeText",
- "element_id": "e5a95dc10d4071983b70898a21f11175",
+ "element_id": "b81dc18d0f8666f9bf7400a00657dc72",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired."
+ "text": "POMS SANE OPFOR DA 29). Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired. You must give alll relevant facts, including a"
},
{
"type": "Title",
- "element_id": "5756fb398995bb6518a87637f24f426e",
+ "element_id": "c7502aa5b000d6446f3eca882518a260",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Time and Place for Filing"
+ "text": "Time and Place for Filing amarall, ammlimeete maet file snete"
},
{
"type": "NarrativeText",
- "element_id": "25f830e7c39c115c9937eb9d11cfb1f2",
+ "element_id": "8b35e7c212710b1099b675ce9394fb47",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "State whether you desire a conference in the National Office if the Service proposes to disapprove your application"
+ "text": "Se NB ON State whether you desire a conference in the National Office if the Service proposes to disapprove your application."
},
{
"type": "Title",
- "element_id": "8b06cd6e2bf7fc15130d5d9ed7e66283",
+ "element_id": "0a16a0fea889be77576c0fd88575554a",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Affiliated Groups"
+ "text": "Affiliated Groups Tavmayare that ara mam)"
},
{
"type": "Title",
- "element_id": "242a9dba10a04654d4adef9c58ff96f6",
+ "element_id": "68b58298cabd9069c975b192a7183139",
"metadata": {
"data_source": {},
"filetype": "image/png",
@@ -351,62 +361,62 @@
},
{
"type": "Title",
- "element_id": "11c98a9cbd6a200fbc5b93fed15007ac",
+ "element_id": "6a8881a6e87021b2362243f7df3e4b1d",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Uniform capitalization rules and limitation on"
+ "text": "Uniform capitalization rules and limitation on cash method.—If you are required to char"
},
{
"type": "Title",
- "element_id": "58703de56debc34a1d68e6ed6f8fd067",
+ "element_id": "8daeb8b48fb666f1dd54e2af283d0c22",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Specific Instructions Section A"
+ "text": "Specific Instructions Section A Neem Ea mama 1 !Taeahle inemes"
},
{
"type": "Title",
- "element_id": "a4316c02df07840f1beb56609cb09735",
+ "element_id": "09203a0c6955f64ca8eb52cd6ea47034",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Late Applications"
+ "text": "Late Applications Me coup armlimatinm te ler"
},
{
"type": "NarrativeText",
- "element_id": "39458f370b98a606db29ac6dee975e07",
+ "element_id": "962e3f0ceb1f0b1b08a1c19adde8d962",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and"
+ "text": "lethal elaine bela Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the"
},
{
"type": "Title",
- "element_id": "025a65465b6fd9635316e92633b24c7e",
+ "element_id": "bfe98eb672d95c15a11ed3e618928b4e",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Identifying Number"
+ "text": "Identifying Number Ndiuidesale Am omptisoehesal"
},
{
"type": "NarrativeText",
- "element_id": "9240bfa889b87dc2fb3fa746ca4eeeb4",
+ "element_id": "87f8128b03a72c616ee1a1bb91e11c56",
"metadata": {
"data_source": {},
"filetype": "image/png",
"page_number": 1
},
- "text": "Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
+ "text": "—e—e—— eee Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
}
]
\ No newline at end of file
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
index bd33d781b..f9bb6e326 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/65/11/main.PMC6312790.pdf.json
@@ -1111,13 +1111,13 @@
},
{
"type": "FigureCaption",
- "element_id": "27b45633a0f31b9e01d179d70d7dc282",
+ "element_id": "b5ee6af3d776b0bbd2e581a3ab2ab2e1",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "5 1 os = — 10; =o ° © —\" 205 i —~é é —ip a5 — Control -2 — & 2.5 T T T 0.0000001 + —-0.00001 0.001 O14 Current Density (A/cm2)"
+ "text": "Potential (Vv)nm°in°}aryT T T0.00001 0.001 olCurrent Density (A/cm2)"
},
{
"type": "UncategorizedText",
@@ -1141,13 +1141,13 @@
},
{
"type": "Table",
- "element_id": "9270ab0a1b3ba26a16991abcd0b45dfe",
+ "element_id": "e2ed41967a486766ad6a122cc3aba4d5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Inhibitor be (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm?) Polarization Corrosion concentration (g) resistance (Q) rate (mm/year) oO 0.0335 0.0409 0.0003 24.0910 2.8163 2 1.9460 0.0596 0.0002 121.440 1.5054 4 0.0163 0.2369 0.0001 42.121 0.9476 6 0.3233 0.0540 5.39E-05 373.180 0.4318 8 0.1240 0.0556 5.46E-05 305.650 0.3772 10 0.0382 0.0086 1.24E-05 246.080 0.0919"
+ "text": "Inhibitorconcentration (g) bc (V/dec) ba (V/dec) Ecorr (V) icorr (A/cm2) Polarizationresistance (Ω) Corrosionrate (mm/year) 0246810 0.03351.94600.01630.32330.12400.0382 0.04090.05960.23690.05400.05560.0086 (cid:3) 0.9393(cid:3) 0.8276(cid:3) 0.8825(cid:3) 0.8027(cid:3) 0.5896(cid:3) 0.5356 0.00030.00020.00015.39E-055.46E-051.24E-05 24.0910121.44042.121373.180305.650246.080 2.81631.50540.94760.43180.37720.0919"
},
{
"type": "UncategorizedText",
@@ -1471,13 +1471,13 @@
},
{
"type": "FigureCaption",
- "element_id": "273fb301b173075f79b2cbdab962e2ff",
+ "element_id": "6959a323ee23c858c3b1411b05db6ebf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "SEM HV: Q0KY WD: 14.89 rmrm ‘9EM MAO: 209 x Det: DOE Pectomsence In nanospact"
+ "text": "SEM HV: Q0KY WD: 14.89 rmrm‘DEM MAO: 209 x Det: DOE Pecforsence In nenospact"
},
{
"type": "NarrativeText",
@@ -1491,13 +1491,13 @@
},
{
"type": "FigureCaption",
- "element_id": "d04d110c16a4ebc184fa130f09b8d423",
+ "element_id": "4f8c25cf7aefbef4af474fe62bed2b33",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "Sem ny. 200 Rv"
+ "text": "gEoswaeSem ny. 200 RvLitt td vegas rescanFertormarce innancesacel"
},
{
"type": "NarrativeText",
@@ -1511,13 +1511,13 @@
},
{
"type": "FigureCaption",
- "element_id": "520d1da08c86ce165cd2843e2dc27f98",
+ "element_id": "035c30f23285fdae72335b94421cf564",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5
},
- "text": "SEMHV: 20.0KV WD: 15.54 mm EM ING: ACO x Dei: OSE"
+ "text": "°@¢Naafe«MgsSEM HY: 20.0KV ALEC CMT LPL LL A a pO OPEM ING: ACO x"
},
{
"type": "NarrativeText",
@@ -1579,16 +1579,6 @@
},
"text": "Austenitic stainless steel Type 316 was used in this study with chemical composition reported in [1,2]. The chemicals used were of annular grade. The inhibitor concentrations are in the range of 2, 4, 6, 8 and 10 g [3–5]. The structural formula of egg shell powder is shown in Fig. 9."
},
- {
- "type": "FigureCaption",
- "element_id": "060e14f01e484ba252e902cd5c6f94f9",
- "metadata": {
- "data_source": {},
- "filetype": "application/pdf",
- "page_number": 6
- },
- "text": "ou H,;COCHNY OH"
- },
{
"type": "UncategorizedText",
"element_id": "c07eeb615f8b0f2d544348b7f0655301",
diff --git a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
index dc329e930..759a2e665 100644
--- a/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/biomed-api/75/29/main.PMC6312793.pdf.json
@@ -791,13 +791,13 @@
},
{
"type": "Table",
- "element_id": "1d8fd023cd0978f7a6500815d2ad0ef6",
+ "element_id": "be8fbf813482eec7fd0e2fc665b4d3bb",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 3
},
- "text": "Instance size (m, n) Average number of Locations Times Vehicles Possible empty travels (8, 1500) 568.40 975.20 652.20 668,279.40 (8, 2000) 672.80 1048.00 857.20 1,195,844.80 (8, 2500) 923.40 1078.00 1082.40 1,866,175.20 (8, 3000) 977.00 1113.20 1272.80 2,705,617.00 (12, 1500) 566.00 994.00 642.00 674,191.00 (12, 2000) 732.60 1040.60 861.20 1,199,659.80 (12, 2500) 875.00 1081.00 1096.00 1,878,745.20 (12, 3000) 1119.60 1107.40 1286.20 2,711,180.40 (16, 1500) 581.80 985.40 667.80 673,585.80 (16, 2000) 778.00 1040.60 872.40 1,200,560.80 (16, 2500) 879.00 1083.20 1076.40 1,879,387.00 ) (16, 3000 1087.20 1101.60 1284.60 2,684,983.60"
+ "text": "Instance size (m, n) Average number of (8, 1500)(8, 2000)(8, 2500)(8, 3000)(12, 1500)(12, 2000)(12, 2500)(12, 3000)(16, 1500)(16, 2000)(16, 2500)(16, 3000) Locations Times Vehicles Possible empty travels 568.40672.80923.40977.00566.00732.60875.001119.60581.80778.00879.001087.20 975.201048.001078.001113.20994.001040.601081.001107.40985.401040.601083.201101.60 652.20857.201082.401272.80642.00861.201096.001286.20667.80872.401076.401284.60 668,279.401,195,844.801,866,175.202,705,617.00674,191.001,199,659.801,878,745.202,711,180.40673,585.801,200,560.801,879,387.002,684,983.60"
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
index 4c7827658..1ea6ad01a 100644
--- a/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/local-single-file-with-pdf-infer-table-structure/layout-parser-paper.pdf.json
@@ -591,13 +591,13 @@
},
{
"type": "FigureCaption",
- "element_id": "812dcaaec927a84d57af36e20adb5ded",
+ "element_id": "dd23a7c381d44f4b36975adaf4d2236d",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 4
},
- "text": "Efficient Data Annotation Model Customization Document Images Community Platform ‘a >) ¥ DIA Model Hub i .) Customized Model Training] == | Layout Detection Models | ——= DIA Pipeline Sharing ~ OCR Module = { Layout Data stuctue ) = (storage Visualization VY"
+ "text": "Model Customization Document Images Community PlatformEfficient Data Annotation ¥ DIA Model HubCustomized Model Training) ==> | Layout Detection wot) = | DIA Pipeline Sharing }4( OCR Module ) = { Layout Data stuctue ) = (storages vsatzaion )"
},
{
"type": "NarrativeText",
@@ -681,14 +681,14 @@
},
{
"type": "Table",
- "element_id": "34923b77ca76e1808956ade5e766f7c2",
+ "element_id": "71e289a268220c21575bb55a73980b83",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 5,
"text_as_html": "
Dataset | | Base Model'| | | Notes | PubLayNet | [38] F/M | Layouts of modern scientific documents |
PRImA [3] | M | Layouts of scanned modern magazines and scientific reports |
Newspaper | F | Layouts of scanned US newspapers from the 20th century |
TableBank | F | Table region on modern scientific and business document |
HJDataset [31] | F/M | Layouts of history Japanese documents |
"
},
- "text": "Dataset | Base Model'| Large Model | Notes PubLayNet B8]| F/M M Layouts of modern scientific documents PRImA M - nned modern magazines and scientific reports Newspapei F - canned US newspapers from the 20th century TableBank F F Table region on modern scientific and business document HJDataset F/M - Layouts of history Japanese documents"
+ "text": "Dataset Base Model1 Large Model Notes PubLayNet [38]PRImA [3]Newspaper [17]TableBank [18]HJDataset [31] F / MMFFF / M M--F- Layouts of modern scientific documentsLayouts of scanned modern magazines and scientific reportsLayouts of scanned US newspapers from the 20th centuryTable region on modern scientific and business documentLayouts of history Japanese documents"
},
{
"type": "UncategorizedText",
@@ -852,13 +852,13 @@
},
{
"type": "FigureCaption",
- "element_id": "d21661161ae2c8dc39e96ee5c660704b",
+ "element_id": "2f498bdd91739a7083490999507420a5",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 6
},
- "text": "- ° . 3 a a 4 a 3 oo er ‘ 2 § 8 a 8 3 3 ‘ £ 4 A g a 9 ‘ 3 ¥ Coordinate g 4 5 3 + § 3 H Extra Features [O=\") [Bo] eaing i Text | | Type | | ower ° & a ¢ o [ coordinatel textblock1, 3 3 ’ g Q 3 , textblock2 , layoutl ] 4 q ® A list of the layout elements Ff"
+ "text": "33§3 fectange vada8883 Coordinate83 +*Block | [Block | [Read8 Extra features Tet | [Tye | [oder[ coordinatel textblock1 |» , see383 , textblock2 , layout] ]4A list of the layout elementsThe same transformation and operation APIs"
},
{
"type": "NarrativeText",
@@ -1062,14 +1062,14 @@
},
{
"type": "Table",
- "element_id": "f81d4915b54758e0d4d52af3566bb813",
+ "element_id": "548c38f86edc295baf869abe37a0d1cf",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 8,
"text_as_html": "Operation Name | | | | Description | block.pad(top, bottom, | right, | left) | | Enlarge the current block according to the input |
block.scale(fx, fy) | | | Scale the current block given the ratio in x and y direction |
block.shift(dx, dy) | | | Move the current block with the shift distances in x and y direction |
blocki.is_in(block2) | | | | Whether block] is inside of block2 |
blocki.intersect (block2) | | | Return the intersection region of blockl and block2. Coordinate type to be determined based on the inputs. |
block1i.union(block2) | | | Return the union region of blockl and block2. Coordinate type to be determined based on the inputs. |
blocki.relative_to(block2) | | | Convert the absolute coordinates of block] to relative coordinates to block2 |
blocki.condition_on(block2) | | | Calculate the absolute coordinates of blockl given the canvas block2’s absolute coordinates |
"
},
- "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) Scale the current block given the ratio ion in x and y di block.shift(dx, dy) Move the current block with the shift distances in x and y direction block1.is_in(block2) Whether block] is inside of block2 ; Return the intersection region of block and block2. block1. intersect (block2) . . . Coordinate type to be determined based on the inputs. ; Return the union region of block1 and block2. block1.union(block2) . . . Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block to block1.relative_to(block2) ' ' relative coordinates to block2 . Calculate the absolute coordinates of block1 given block1.condition_on(block2) . the canvas block2’s absolute coordinates block. crop_image (image) Obtain the image segments in the block region"
+ "text": "Operation Name Description block.pad(top, bottom, right, left) Enlarge the current block according to the input block.scale(fx, fy) block.shift(dx, dy) Scale the current block given the ratioin x and y direction Move the current block with the shiftdistances in x and y direction block1.is in(block2) Whether block1 is inside of block2 block1.intersect(block2) block1.union(block2) block1.relative to(block2) block1.condition on(block2) Return the intersection region of block1 and block2.Coordinate type to be determined based on the inputs. Return the union region of block1 and block2.Coordinate type to be determined based on the inputs. Convert the absolute coordinates of block1 torelative coordinates to block2 Calculate the absolute coordinates of block1 giventhe canvas block2’s absolute coordinates Obtain the image segments in the block region"
},
{
"type": "UncategorizedText",
@@ -1333,13 +1333,13 @@
},
{
"type": "FigureCaption",
- "element_id": "975d6cb141cb0a0313375630ae063fa8",
+ "element_id": "d10d70e48ed0066bd15dd133d09f61fd",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 9
},
- "text": "x09 Burpunog uayor Aeydsiq 1 vondo 10g Guypunog usyoy apir:z uondo Mode I: Showing Layout on the Original Image Mode Il: Drawing OCR'd Text at the Correspoding Position"
+ "text": "a ESMode I: Showing Layout on the Original ImageayeMode Il: Drawing OCR'd Text at the Correspoding Position10g Bpunog vayoy feyds1q :1 vondo‘xog Burpunog vay apiH z word"
},
{
"type": "NarrativeText",
@@ -1423,13 +1423,13 @@
},
{
"type": "FigureCaption",
- "element_id": "2680b3c7a55754a3ba2738cb3d9d5e8b",
+ "element_id": "0ff9ad06a304818ae83b93c6f2b16309",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 10
},
- "text": "et Intra-column reading order Token Categories tie (Adress 2) tee (NE sumber Variable HEE company type Column Categories (J tite we) adaress —_ (7) section Header by ‘e * Column reading order a a (a) Illustration of the original Japanese Maximum Allowed Height BRE B>e EER eR (b) Illustration of the recreated document with dense text structure for better OCR performance"
+ "text": "Column reading orderMaximum Allowed HeightZ. Shen et al.Intra-column reading ordert el 1 ili |.(a) Illustration of the original Japanese document with detected layout elements highlighted in colored boxesCe tans &iia! aaae oeRRbaeienases(b) Illustration of the recreated document with dense text structure for better OCR performance‘Token CategoriesMoteAddresstetNumberVaribiecompany typeColumn Categories(J tite| Aatress(tee[7] section adr"
},
{
"type": "NarrativeText",
@@ -1543,13 +1543,13 @@
},
{
"type": "FigureCaption",
- "element_id": "b33b2bc3b9c416673c7f74c6a00c49d8",
+ "element_id": "55f2474c66877608ca9b463a7076573e",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 11
},
- "text": "(spe peepee, ‘Active Learning Layout Annotate Layout Dataset | + ‘Annotation Toolkit ¥ a Deep Leaming Layout Model Training & Inference, ¥ ; Handy Data Structures & Post-processing El Apis for Layout Det a LAR ror tye eats) 4 Text Recognition | <—— Default ane Customized ¥ ee Layout Structure Visualization & Export | <—— | visualization & Storage The Japanese Document Helpful LayoutParser Digitization Pipeline Modules"
+ "text": "(spe peepee,Active Learning Layout=Annotate Layout Dataset parte4zi Deep Learning LayoutLayout Detection Model Training & Inference,Post-processin Handy Data Structures &pl 9 APIs for Layout DataText Recognition Default and Customized: r OCR Models4Visualization & Export | <——Layout StructureVisualization & StorageThe Japanese Document Helpful LayoutParserDigitization Pipeline Modules"
},
{
"type": "UncategorizedText",
@@ -1723,13 +1723,13 @@
},
{
"type": "FigureCaption",
- "element_id": "7d42bb6af1404a95a6e8870d5c4d07bf",
+ "element_id": "f58d47bde7ebddd81c4a678c918a8f1b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 13
},
- "text": "(@) Partial table at the bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line"
+ "text": "(2) Partial table atthe bottom (&) Full page table (6) Partial table at the top (d) Mis-detected tet line"
},
{
"type": "NarrativeText",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
index 89ce864c3..884200e49 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/2023-Jan-economic-outlook.pdf.json
@@ -721,13 +721,13 @@
},
{
"type": "Table",
- "element_id": "63bdc79def2500227001ac95d78727ab",
+ "element_id": "8dec233e9bc75c7256a28a899794709b",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 7
},
- "text": "Difference from October 2022 Q4 over Q4 2/ Estimate___ Projections WEO Projections 1/ Estimate Projections 2021 2022 2023 2024 2023 2024 2022 2023 2024 World Output 6.2 34 29 34 0.2 0.1 1.9 3.2 3.0 Advanced Economies 5.4 27 1.2 14 04 0.2 1.3 14 1.6 United States 5.9 2.0 14 1.0 04 -0.2 07 1.0 13 Euro Area 5.3 3.5 07 16 0.2 -0.2 19 0.5 24 Germany 26 19 01 14 04 0.1 14 0.0 23 France 68 26 07 16 0.0 0.0 0.5 09 18 Italy 67 3.9 06 0.9 08 -04 21 0.1 1.0 Spain 5.5 5.2 14 24 -0.1 -0.2 21 13 28 Japan 21 14 18 0.9 0.2 -04 17 1.0 1.0 United Kingdom 76 41 -06 0.9 -0.9 03 04 -05 18 Canada 5.0 3.5 15 15 0.0 0.1 23 12 1.9 Other Advanced Economies 3/ 5.3 28 20 24 -03 02 14 2a 2.2 Emerging Market and Developing Economies 67 3.9 40 42 0.3 -0.1 25 5.0 4A Emerging and Developing Asia 74 43 5.3 5.2 04 0.0 3.4 6.2 49 China 84 3.0 5.2 45 08 0.0 29 5.9 41 India 4/ 87 68 61 68 0.0 0.0 43 70 7A Emerging and Developing Europe 69 07 15 26 0.9 01 -2.0 3.5 28 Russia 47 -2.2 0.3 21 26 06 441 1.0 2.0 Latin America and the Caribbean 7.0 3.9 18 2a 04 0.3 26 1.9 19 Brazil 5.0 34 12 15 0.2 -04 28 0.8 22 Mexico 47 34 47 16 05 -0.2 37 14 1.9 Middle East and Central Asia 45 5.3 3.2 37 -04 0.2 . . . Saudi Arabia 3.2 87 26 34 -11 0.5 46 27 35 Sub-Saharan Africa 47 38 38 41 04 0.0 = ao ao Nigeria 3.6 3.0 3.2 29 0.2 0.0 26 31 29 South Africa 49 26 12 13 01 0.0 3.0 0.5 18 Memorandum World Growth Based on Market Exchange Rates 6.0 3.41 24 25 03 -0.1 17 25 25 European Union 5.5 37 07 18 0.0 -0.3 18 1.2 2.0 ASEAN-5 5/ 3.8 5.2 43 47 0.2 -0.2 37 57 40 Middle East and North Africa 41 54 3.2 35 -04 0.2 a . . Emerging Market and Middle-Income Economies 70 38 40 44 04 0.0 25 5.0 44 Low-Income Developing Countries 441 49 49 56 0.0 01 World Trade Volume (goods and services) 6/ 10.4 5.4 24 3.4 -01 -0.3 Advanced Economies 94 66 23 27 0.0 -04 Emerging Market and Developing Economies 124 34 26 46 03 0.0 Commodity Prices Oil 7/ 65.8 39.8 -16.2 71 33 -0.9 11.2 -98 59 Nonfuel (average based on world commodity import weights) 26.4 70 -6.3 -0.4 -01 03 -2.0 14 -0.2 World Consumer Prices 8/ 47 88 6.6 43 04 0.2 9.2 5.0 3.5 Advanced Economies 9/ 34 73 46 26 0.2 02 78 31 23 Emerging Market and Developing Economies 8/ 5.9 99 84 5.5 0.0 02 10.4 66 45,"
+ "text": "Estimate2022 Projections 2023 2024 2021 WEO Projections 1/ 2023 2024 Estimate2022 Projections 2023 2024 Difference from October 2022 Q4 over Q4 2/ World Output Advanced Economies United States Euro Area Germany France Italy Spain Japan United Kingdom Canada Other Advanced Economies 3/ Emerging Market and Developing Economies Emerging and Developing Asia China India 4/ Emerging and Developing Europe Russia Latin America and the Caribbean Brazil Mexico Middle East and Central Asia Saudi Arabia Sub-Saharan Africa Nigeria South Africa Memorandum World Growth Based on Market Exchange Rates European Union ASEAN-5 5/ Middle East and North Africa Emerging Market and Middle-Income Economies Low-Income Developing Countries World Trade Volume (goods and services) 6/ Advanced Economies Emerging Market and Developing Economies Commodity Prices Oil 7/ Nonfuel (average based on world commodity import weights) World Consumer Prices 8/ Advanced Economies 9/ Emerging Market and Developing Economies 8/ 6.2 5.4 5.9 5.3 2.6 6.8 6.7 5.5 2.1 7.6 5.0 5.3 6.7 7.4 8.4 8.7 6.9 4.7 7.0 5.0 4.7 4.5 3.2 4.7 3.6 4.9 6.0 5.5 3.8 4.1 7.0 4.1 10.4 9.4 12.1 65.8 26.4 4.7 3.1 5.9 3.4 2.7 2.0 3.5 1.9 2.6 3.9 5.2 1.4 4.1 3.5 2.8 3.9 4.3 3.0 6.8 0.7 –2.2 3.9 3.1 3.1 5.3 8.7 3.8 3.0 2.6 3.1 3.7 5.2 5.4 3.8 4.9 5.4 6.6 3.4 39.8 7.0 8.8 7.3 9.9 2.9 1.2 1.4 0.7 0.1 0.7 0.6 1.1 1.8 –0.6 1.5 2.0 4.0 5.3 5.2 6.1 1.5 0.3 1.8 1.2 1.7 3.2 2.6 3.8 3.2 1.2 2.4 0.7 4.3 3.2 4.0 4.9 2.4 2.3 2.6 3.1 1.4 1.0 1.6 1.4 1.6 0.9 2.4 0.9 0.9 1.5 2.4 4.2 5.2 4.5 6.8 2.6 2.1 2.1 1.5 1.6 3.7 3.4 4.1 2.9 1.3 2.5 1.8 4.7 3.5 4.1 5.6 3.4 2.7 4.6 –16.2 –6.3 6.6 4.6 8.1 –7.1 –0.4 4.3 2.6 5.5 0.2 0.1 0.4 0.2 0.4 0.0 0.8 –0.1 0.2 –0.9 0.0 –0.3 0.3 0.4 0.8 0.0 0.9 2.6 0.1 0.2 0.5 –0.4 –1.1 0.1 0.2 0.1 0.3 0.0 –0.2 –0.4 0.4 0.0 –0.1 0.0 –0.3 –3.3 –0.1 0.1 0.2 0.0 –0.1 –0.2 –0.2 –0.2 –0.1 0.0 –0.4 –0.2 –0.4 0.3 –0.1 –0.2 –0.1 0.0 0.0 0.0 0.1 0.6 –0.3 –0.4 –0.2 0.2 0.5 0.0 0.0 0.0 –0.1 –0.3 –0.2 0.2 0.0 0.1 –0.3 –0.4 0.0 –0.9 0.3 0.2 0.2 0.2 1.9 1.3 0.7 1.9 1.4 0.5 2.1 2.1 1.7 0.4 2.3 1.4 2.5 3.4 2.9 4.3 –2.0 –4.1 2.6 2.8 3.7 . . . 4.6 . . . 2.6 3.0 1.7 1.8 3.7 . . . 2.5 . . . . . . . . . . . . 11.2 –2.0 9.2 7.8 10.4 3.2 1.1 1.0 0.5 0.0 0.9 0.1 1.3 1.0 –0.5 1.2 2.1 5.0 6.2 5.9 7.0 3.5 1.0 1.9 0.8 1.1 . . . 2.7 . . . 3.1 0.5 2.5 1.2 5.7 . . . 5.0 . . . . . . . . . . . . 3.0 1.6 1.3 2.1 2.3 1.8 1.0 2.8 1.0 1.8 1.9 2.2 4.1 4.9 4.1 7.1 2.8 2.0 1.9 2.2 1.9 . . . 3.5 . . . 2.9 1.8 2.5 2.0 4.0 . . . 4.1 . . . . . . . . . . . . –9.8 1.4 5.0 3.1 6.6 –5.9 –0.2 3.5 2.3 4.5 Q4 over Q4 2/ eee na RA Estimate_ le Projections Estimate_ le Projections Cetimatel Oo [re ee~T Se 2022 — 2022 — anna es battee eesaanShaeaan eeeanne anan aaa ——eeaaa World Output 6.2 0.2 19 eveee eve nmeAdvanced EconomiesTinta Chetan 54LO 27On 14an 04nA baal1.3nz 1.649 eee meeeUnited StatesCo Ae STN taeEuro Area‘Rareram) Vee5.3 vir0.2 ENGermanyCoen ww26no 2 « Ve04An eianyFranceleah, ov68 MAE07 vir0.0 ree0.5 vv0.9 ve67ce ev3.9— ve08na vwna vee24a4 vee0.14a NaySpain vee24no aNN+ “penJapanVithed Vinmdaee, we24— wwe04a voro!United KingdomPanada et76 Vw eeCanadaAbn Ndime nnd Camnaeeinn O) vee15nA vwnn vr23“aA INOSOther Advanced Economies 3/ wu5.3 ww28 Ld20 idl24 vw0.3 ow14 te24 Lindl22 MEE REEmerging Market and Developing Economiesnee a ree nee25 }et5.0 “eae a eee eeg and Developing Asia Me74 “~~04 ail3.4 EITC TSHTS GID MIC VOIOP HY PwictChinabese at vie45no Vt08An vt29aD ineIndia 4/ oT87 liad68 ve0.0 og43 Emerging and Developing EuropeDa. 15na 264 SET IMS MeN Pig SMRussiaLatin Amarina and the Carihhaan ve47mn al26na 44on 1.040 ENONLatin America and the CaribbeanDeasil iN oN04no nA OLEAN GM NENS aieBrazilMavien is ov28a7 eenMexicoRoeldia Cast end antral Asia ue0.5na eu3.7 oe19 bichinanbaiastelMiddle East and Central AsiaCad; Apahia 45ao vat-04aa ) veeAg 7 ac ee ON EN EENSaudi ArabiaCok Gahacen Af aia3.2az uw87— Ve26— ue3.4Aa Ne14na 46 27 35 ve eeSub-Saharan AfricaAlinaria ve47 on ofa4 0 ENNNigeriaOak AEinn ve3.0ne INIQe TeSouth Africa vw49 wu26 ve12 dl1.3 Ve041 vw0.0 ov3.0 vet0.5 og18 STMemorandumUae Pereath Beene an Marat Conhanns Dates BEES GEESIAISTWorld Growth Based on Market Exchange Ratesco -041 TN EE BO VN ene ee eeEuropean UnionACCA EE! vv5.5 vl37fo Lal0.0 aurea eatASEAN-5 5/Reebde Cred ered Rbesie Adslan ond38aa ve0.2nA dad0.2— ov4.0 rawMiddle East and North Africa ve44 del-04 vee vee NS EO EN EENEmerging Market and Middle-Income Economies 25 5.0 44 ee eee, Oe eee eeeLow-Income Developing Countries 44 awe nan eee eeeWorld Trade Volume (goods and services) 6/Aduannad Enanamineneo Teh10.4oO” Te2400 uM0.1ann ON ES NOES OE DESO NEAdvanced Economieser mee9.4“ana “rt23ne Mee0.0— iawn eensEmerging Market and Developing Economies oT12.1 vw3.4 ow26 of46 vw0.3 ~~ BESTT MIGINGL ait event eSCommodity Prices=) EIN!Oil 7/be® an! fesempnses became! nun taniied ananenelieeeS 65.8me A 39.8—- -16.2— 11.2an MirNonfuel (average based on world commodity import weights) V026.4 ve.0 I-0.4 Te-0.1 we Tew14 DMI! \\arvliagy Veo nN Ay aeWorld Consumer Prices 8/Aehiannedd Kamnemnian fii ae47nA Se MESES EIS OFAdvanced Economies 9/Emarninn Marbat and Navalaninn Erancs ee34 Lda0.2 02nd oe78ana ve2.3 rE RS OTEmerging Market and Developing Economies 8/ ) 10.4 fame i i al oe vt =: Lone aNate’ Ranl affective auchanna mtoe ara aeaamnad tn ramain oanatant atthe laugle nraunlinn dinna Oeinhar dA D00_Nevemherd] O00) Fenny nine noe ct Tho an _\"iT iartearhy"
},
{
"type": "UncategorizedText",
diff --git a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
index 0d3a1bc19..df30ef9bb 100644
--- a/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
+++ b/test_unstructured_ingest/expected-structured-output/s3/small-pdf-set/recalibrating-risk-report.pdf.json
@@ -1099,6 +1099,16 @@
},
"text": "6"
},
+ {
+ "type": "FigureCaption",
+ "element_id": "f58b520072d30c4805940f5c99a306c3",
+ "metadata": {
+ "data_source": {},
+ "filetype": "application/pdf",
+ "page_number": 9
+ },
+ "text": "an ¥3 te,ay."
+ },
{
"type": "NarrativeText",
"element_id": "d754d8d468346f652657279272a11897",
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 41643b78b..602eae017 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.10.4-dev0" # pragma: no cover
+__version__ = "0.10.4" # pragma: no cover
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
index 57196c50e..088184070 100644
--- a/unstructured/partition/pdf.py
+++ b/unstructured/partition/pdf.py
@@ -188,6 +188,7 @@ def partition_pdf_or_image(
infer_table_structure=infer_table_structure,
include_page_breaks=include_page_breaks,
ocr_languages=ocr_languages,
+ ocr_mode="individual_blocks",
metadata_last_modified=metadata_last_modified or last_modification_date,
**kwargs,
)
@@ -219,6 +220,7 @@ def _partition_pdf_or_image_local(
infer_table_structure: bool = False,
include_page_breaks: bool = False,
ocr_languages: str = "eng",
+ ocr_mode: str = "entire_page",
model_name: Optional[str] = None,
metadata_last_modified: Optional[str] = None,
**kwargs,
@@ -235,6 +237,7 @@ def _partition_pdf_or_image_local(
process_file_with_model_kwargs = {
"is_image": is_image,
"ocr_languages": ocr_languages,
+ "ocr_mode": ocr_mode,
"extract_tables": infer_table_structure,
"model_name": model_name,
}
@@ -249,6 +252,7 @@ def _partition_pdf_or_image_local(
file,
is_image=is_image,
ocr_languages=ocr_languages,
+ ocr_mode=ocr_mode,
extract_tables=infer_table_structure,
model_name=model_name,
)