Chore(ingest): add tests on PDFs with fast strategy (#614)

Summary
* Updates "fast" PDF output element ordering to be consistent across Python versions by using the X,Y coordinates of elements extracted
* Added PDFs ingest tests with fast strategy with new script ./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh
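The ordering fix collects elements per page and sorts them by the top-left corner of their coordinates before appending them to the output. A minimal sketch of that sort, pulled out as a standalone helper (the coordinate layout and sort key follow the `_process_pdfminer_pages` diff below; the helper name itself is illustrative):

```python
# Minimal sketch of the coordinate-based ordering used for the "fast" strategy.
# Coordinates follow the ((x1, y1), (x1, y2), (x2, y2), (x2, y1)) layout set in
# _process_pdfminer_pages; elements without coordinates sort to the end.
def sort_page_elements(page_elements):
    return sorted(
        page_elements,
        key=lambda el: (
            el.coordinates[0][1] if el.coordinates else float("inf"),  # top y
            el.coordinates[0][0] if el.coordinates else float("inf"),  # left x
        ),
    )
```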

Updated ingest test procedure:

* Process files with the hi_res strategy and preserve the downloads in the repo under files-ingest-download/<ingest_test_name>
* Reprocess all PDFs with the fast strategy from the local files-ingest-download directory; the partition outputs are stored at expected-structured-output/pdf-fast-reprocess/<ingest_test_name> (see the example command after this list)
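For reference, the reprocess step amounts to pointing the ingest CLI's local connector at the preserved downloads with the fast strategy; the command below mirrors the new test-ingest-pdf-fast-reprocess.sh script added in this PR (run from the repo root):

```bash
# Reprocess previously downloaded PDFs with the fast strategy
# (mirrors test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh).
PYTHONPATH=. ./unstructured/ingest/main.py \
  --metadata-exclude filename,file_directory,metadata.data_source.date_processed \
  --local-input-path files-ingest-download \
  --local-recursive \
  --local-file-glob "*.pdf" \
  --structured-output-dir pdf-fast-reprocess-ingest-output \
  --partition-strategy fast \
  --reprocess \
  --num-processes 2
```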
Test
* Reproduce the tests with ./scripts/ingest-test-fixtures-update.sh; expect no fixture updates. No secret tokens are needed, since the relevant tests won't produce PDFs.
Yuming Long 2023-06-12 15:02:48 -04:00 committed by GitHub
parent 3f80301964
commit 2fbb1ccd30
22 changed files with 6271 additions and 11 deletions

View File

@@ -1,7 +1,10 @@
## 0.7.5-dev0
## 0.7.5-dev1
### Enhancements
* Adds functionality to sort elements in `partition_pdf` for `fast` strategy
* Adds ingest tests with `--fast` strategy on PDF documents
### Features
### Fixes

View File

@@ -20,6 +20,6 @@ RUN source ~/.bashrc && pyenv virtualenv 3.8.15 unstructured && \
make install-ingest-azure && \
make install-ingest-github && \
make install-ingest-gitlab && \
make install-ingest-wikipedia && \
make install-ingest-wikipedia && \
make install-ingest-discord && \
make install install-ingest-slack

View File

@@ -51,4 +51,5 @@ docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured \
./test_unstructured_ingest/test-ingest-biomed-api.sh &&
./test_unstructured_ingest/test-ingest-biomed-path.sh &&
./test_unstructured_ingest/test-ingest-s3.sh &&
./test_unstructured_ingest/test-ingest-slack.sh"
./test_unstructured_ingest/test-ingest-slack.sh &&
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh"

View File

@@ -475,11 +475,11 @@ def test_partition_pdf_fast_groups_text_in_text_box():
assert str(elements[1]).endswith("Jordan and Egypt.")
assert elements[3] == Title(
"kilograms CO₂e/boe carbon intensity from our Eastern Mediterranean operations in 2022",
"1st",
coordinates=(
(69.4871, 222.4357),
(69.4871, 272.1607),
(197.8209, 272.1607),
(197.8209, 222.4357),
(273.9929, 181.16470000000004),
(273.9929, 226.16470000000004),
(333.59990000000005, 226.16470000000004),
(333.59990000000005, 181.16470000000004),
),
)

View File

@@ -0,0 +1,252 @@
[
{
"element_id": "611cb5b35c8277f981fe5faaaab7b1a5",
"text": "Core Skills for Biomedical Data Scientists",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "64b2134f054446d473fce1b05d4d4c94",
"text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "a6f2309aca564ada03b7ef16e5d0e9ab",
"text": "Lisa Federer, MLIS, Data Science Training Coordinator",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "7f56b84c46cb41ebdcec2c9ac8673d72",
"text": "Michael F. Huerta, PhD, Associate Director of NLM for Program Development and NLM Coordinator of Data Science and Open Science Initiatives",
"type": "UncategorizedText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "53d548aa01fc3eb72da15a5be7f235e2",
"text": "Executive Summary",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "f14031943b3f1e34dcfc27bf02c38c09",
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "8f90f5970c85f335b1bf50af611ce5c5",
"text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "d1a5bb898aee8de0fbdf048c7a9fb01d",
"text": "2. Programming language expertise: biomedical data scientists should be fluent in at",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "18e42d24d6449a9b52fc65fc3f9710b4",
"text": "least one programming language (typically R and/or Python);",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "c6be5389b7bd00746d39b7bac468dea0",
"text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "1b8039583cbc15f654c89f2141eb6e10",
"text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "2f87757b1d497a32c077be543632ed7d",
"text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "34b28172088bba51c6764df6d4e87674",
"text": "The report further details specific skills and expertise relevant to biomedical data scientists.",
"type": "UncategorizedText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "89b1f4c3df983454e25b233320781610",
"text": "Motivation",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "5e3d4670749a0f3753fa4bb1b328d156",
"text": "Training a biomedical data science (BDS) workforce is a central theme in NLMs Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 1
}
},
{
"element_id": "68431de56564c6ad6aa3e6c02b78c89c",
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________",
"type": "UncategorizedText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "4c5f925a7db08289f19dbe8635d8b4cd",
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "f26d07e6b71e42596791a241e2417931",
"text": "Methodology",
"type": "Title",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "bcefa2402c4d32dbf76a40451d0fc3dd",
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "a24acaf1cb5d6f8a0a0af0e81949765b",
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "8b344c7a03b0e90a794d3c4dd7de87d4",
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "6934158451634cb4c2a470b7734d29f6",
"text": "c) Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "91da3a0694b9cdc01c32e1d3071f3941",
"text": "Analysis of the above data provided insights into the current state of biomedical data science training, as well as a view into data science-related skills likely to be needed to prepare the BDS workforce to succeed in the future. Together, these analyses informed recommendations for core skills necessary for a competitive biomedical data scientist.",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "f39ddfa6365e505947527153b0ea60d8",
"text": "1 Kaggle is an online community for data scientists, serving as a platform for collaboration, competition, and learning: http://kaggle.com 2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"type": "NarrativeText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
},
{
"element_id": "d4735e3a265e16eee03f59718b9b5d03",
"text": "2",
"type": "UncategorizedText",
"metadata": {
"data_source": {},
"filetype": "application/pdf",
"page_number": 2
}
}
]

View File

@@ -11,6 +11,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--azure-account-name azureunstructured1 \
--structured-output-dir azure-ingest-output \
--partition-strategy hi_res \
--download-dir files-ingest-download/azure \
--preserve-downloads \
--reprocess \
--num-processes 2

View File

@@ -34,6 +34,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--reprocess \
--verbose \
--re-download \
--download-dir files-ingest-download/biomed-api \
--preserve-downloads
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

View File

@@ -31,7 +31,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--num-processes 2 \
--partition-strategy hi_res \
--reprocess \
--download-dir biomed-download-path \
--download-dir files-ingest-download/biomed-path \
--preserve-downloads \
--verbose

View File

@@ -17,6 +17,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--discord-token "$DISCORD_TOKEN" \
--download-dir discord-ingest-download \
--structured-output-dir discord-ingest-output \
--preserve-downloads \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

View File

@@ -31,6 +31,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--structured-output-dir github-downloadify-output \
--partition-strategy hi_res \
--reprocess \
--download-dir files-ingest-download/github \
--preserve-downloads \
--verbose $ACCESS_TOKEN_FLAGS

View File

@@ -12,6 +12,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--structured-output-dir gitlab-ingest-output \
--git-branch 'v0.0.7' \
--partition-strategy hi_res \
--download-dir files-ingest-download/gitlab \
--preserve-downloads \
--verbose
set +e

View File

@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# A local connector to process pre-downloaded PDFs under the `files-ingest-download` dir with the --fast strategy
set -e
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "$SCRIPT_DIR"/.. || exit 1
PYTHONPATH=. ./unstructured/ingest/main.py \
--metadata-exclude filename,file_directory,metadata.data_source.date_processed \
--local-input-path files-ingest-download \
--local-recursive \
--local-file-glob "*.pdf" \
--structured-output-dir pdf-fast-reprocess-ingest-output \
--partition-strategy fast \
--reprocess \
--num-processes 2
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
set +e
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
cp -a pdf-fast-reprocess-ingest-output/* test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess/
elif ! diff -ru test_unstructured_ingest/expected-structured-output/pdf-fast-reprocess pdf-fast-reprocess-ingest-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI."
echo
exit 1
fi

View File

@@ -18,6 +18,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--structured-output-dir s3-small-batch-output \
--preserve-downloads \
--partition-strategy hi_res \
--download-dir files-ingest-download/s3 \
--reprocess
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}

View File

@@ -17,6 +17,7 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--download-dir slack-ingest-download \
--structured-output-dir slack-ingest-output \
--partition-strategy hi_res \
--preserve-downloads \
--start-date 2023-04-01 \
--end-date 2023-04-08T12:00:00-08:00 \
--reprocess

View File

@@ -11,6 +11,8 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--structured-output-dir wikipedia-ingest-output \
--num-processes 2 \
--partition-strategy hi_res \
--download-dir files-ingest-download/wikipedia \
--preserve-downloads \
--verbose
set +e

View File

@@ -19,3 +19,5 @@ export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-local.sh
./test_unstructured_ingest/test-ingest-slack.sh
./test_unstructured_ingest/test-ingest-against-api.sh
# NOTE(yuming): The following test should be put after any tests with --preserve-downloads option
./test_unstructured_ingest/test-ingest-pdf-fast-reprocess.sh

View File

@@ -1 +1 @@
__version__ = "0.7.5-dev0" # pragma: no cover
__version__ = "0.7.5-dev1" # pragma: no cover

View File

@@ -269,6 +269,7 @@ def _process_pdfminer_pages(
height = page.height
text_segments = []
page_elements = []
for obj in page:
x1, y2, x2, y1 = obj.bbox
y1 = height - y1
@@ -286,7 +287,16 @@ def _process_pdfminer_pages(
element = element_from_text(_text)
element.coordinates = ((x1, y1), (x1, y2), (x2, y2), (x2, y1))
element.metadata = metadata
elements.append(element)
page_elements.append(element)
sorted_page_elements = sorted(
page_elements,
key=lambda el: (
el.coordinates[0][1] if el.coordinates else float("inf"),
el.coordinates[0][0] if el.coordinates else float("inf"),
),
)
elements += sorted_page_elements
if include_page_breaks:
elements.append(PageBreak())