mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2026-01-08 05:10:11 +00:00
This PR adds support for link extraction in the PDF `hi_res`
strategy. The `partition_pdf()` function now supports link extraction
when using the `hi_res` strategy, allowing users to extract hyperlinks
from PDF documents.
### Summary
- Added functionality to support link extraction in the `hi_res` flow
- Enhanced the word extraction functionality used for link extraction in
both the `fast` and `hi_res` flows, resulting in more accurate `start_index`
and `text` values in the `links` metadata.
- Updated the ingest fixture-update workflow so that it no longer skips
the Astra DB source test
### Testing
```
elements = partition_pdf(
filename="example-docs/pdf/embedded-link.pdf",
strategy="hi_res"
)
assert len(elements[0].metadata.links) == 3
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
Co-authored-by: cragwolfe <crag@unstructured.io>
148 lines
6.1 KiB
YAML
148 lines
6.1 KiB
YAML
name: Ingest Test Fixtures Update PR

# Runs only when triggered manually (workflow_dispatch).
on:
  workflow_dispatch:

env:
  # Kept quoted so YAML treats it as the string "3.10" rather than the float 3.1.
  PYTHON_VERSION: "3.10"

# Permissions granted to the workflow's GITHUB_TOKEN.
permissions:
  id-token: write
  contents: read

jobs:
  # Runs the repository-local base-cache action for the configured Python version.
  setup:
    runs-on: ubuntu-latest-m
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}

  # Runs the repository-local base-ingest-cache action in check-only mode.
  setup_ingest:
    runs-on: ubuntu-latest
    env:
      NLTK_DATA: ${{ github.workspace }}/nltk_data
    needs: [setup]
    steps:
      - uses: actions/checkout@v4
      - uses: ./.github/actions/base-ingest-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
          check-only: 'true'

  # Regenerates the ingest test fixtures and opens a pull request with the result.
  update-fixtures-and-pr:
    runs-on: ubuntu-latest-m
    needs: [setup_ingest]
    steps:
      # actions/checkout MUST come before auth
      - uses: 'actions/checkout@v4'
      - name: Set up Python ${{ env.PYTHON_VERSION }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Get full Python version
        id: full-python-version
        # Publishes sys.version_info joined by '-' as the step output `version`.
        run: echo version=$(python -c "import sys; print('-'.join(str(v) for v in sys.version_info))") >> "$GITHUB_OUTPUT"
      - name: Setup virtual environment
        uses: ./.github/actions/base-ingest-cache
        with:
          python-version: ${{ env.PYTHON_VERSION }}
      - name: Setup docker-compose
        uses: KengoTODA/actions-setup-docker-compose@v1
        with:
          version: '2.22.0'
      - name: Update test fixtures
        env:
          AIRTABLE_PERSONAL_ACCESS_TOKEN: ${{ secrets.AIRTABLE_PERSONAL_ACCESS_TOKEN }}
          BOX_APP_CONFIG: ${{ secrets.BOX_APP_CONFIG }}
          CONFLUENCE_API_TOKEN: ${{ secrets.CONFLUENCE_API_TOKEN }}
          CONFLUENCE_USER_EMAIL: ${{ secrets.CONFLUENCE_USER_EMAIL }}
          DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN }}
          DROPBOX_APP_KEY: ${{ secrets.DROPBOX_APP_KEY }}
          DROPBOX_APP_SECRET: ${{ secrets.DROPBOX_APP_SECRET }}
          DROPBOX_REFRESH_TOKEN: ${{ secrets.DROPBOX_REFRESH_TOKEN }}
          GCP_INGEST_SERVICE_KEY: ${{ secrets.GCP_INGEST_SERVICE_KEY }}
          GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }}
          HUBSPOT_API_TOKEN: ${{ secrets.HUBSPOT_API_TOKEN }}
          JIRA_INGEST_API_TOKEN: ${{ secrets.JIRA_INGEST_API_TOKEN }}
          JIRA_INGEST_USER_EMAIL: ${{ secrets.JIRA_INGEST_USER_EMAIL }}
          MONGODB_URI: ${{ secrets.MONGODB_URI }}
          MONGODB_DATABASE_NAME: ${{ secrets.MONGODB_DATABASE_NAME }}
          MS_CLIENT_CRED: ${{ secrets.MS_CLIENT_CRED }}
          MS_CLIENT_ID: ${{ secrets.MS_CLIENT_ID }}
          MS_TENANT_ID: ${{ secrets.MS_TENANT_ID }}
          MS_USER_EMAIL: ${{ secrets.MS_USER_EMAIL }}
          MS_USER_PNAME: ${{ secrets.MS_USER_PNAME }}
          SALESFORCE_USERNAME: ${{secrets.SALESFORCE_USERNAME}}
          SALESFORCE_CONSUMER_KEY: ${{secrets.SALESFORCE_CONSUMER_KEY}}
          SALESFORCE_PRIVATE_KEY: ${{secrets.SALESFORCE_PRIVATE_KEY}}
          SHAREPOINT_CLIENT_ID: ${{secrets.SHAREPOINT_CLIENT_ID}}
          SHAREPOINT_CRED: ${{secrets.SHAREPOINT_CRED}}
          SHAREPOINT_SITE: ${{secrets.SHAREPOINT_SITE}}
          SHAREPOINT_PERMISSIONS_APP_ID: ${{secrets.SHAREPOINT_PERMISSIONS_APP_ID}}
          SHAREPOINT_PERMISSIONS_APP_CRED: ${{secrets.SHAREPOINT_PERMISSIONS_APP_CRED}}
          SHAREPOINT_PERMISSIONS_TENANT: ${{secrets.SHAREPOINT_PERMISSIONS_TENANT}}
          SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
          UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
          NOTION_API_KEY: ${{ secrets.NOTION_API_KEY }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
          AZURE_SEARCH_ENDPOINT: ${{ secrets.AZURE_SEARCH_ENDPOINT }}
          AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
          ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
          ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
          OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
          OVERWRITE_FIXTURES: "true"
          CI: "true"
        run: |
          source .venv/bin/activate
          sudo apt-get update
          sudo apt-get install -y libmagic-dev poppler-utils libreoffice
          make install-pandoc
          sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
          sudo apt-get install -y tesseract-ocr
          sudo apt-get install -y tesseract-ocr-kor
          # -y keeps this install non-interactive, matching the installs above.
          sudo apt-get install -y diffstat
          tesseract --version
          ./test_unstructured_ingest/test-ingest-src.sh

      - name: Save branch name to environment file
        id: branch
        run: |
          original_branch=$(git rev-parse --abbrev-ref HEAD)
          # NOTE(review): the suffix begins with a literal '|' - confirm this is intentional.
          suffix="|ingest-test-fixtures-update-$(git rev-parse --short HEAD)"
          branch_name="$original_branch$suffix"
          echo "BRANCH_NAME=$branch_name" >> "$GITHUB_ENV"

      - name: Save PR name to environment file
        id: pr
        run: |
          commit_sha=$(git rev-parse HEAD)
          # Look up the pull requests associated with the checked-out commit.
          prs=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
            "https://api.github.com/repos/${{ github.repository }}/commits/${commit_sha}/pulls")
          # Title of the first PR returned; jq -r prints "null" when there is none.
          pr_name=$(echo "$prs" | jq -r '.[0].title')
          echo "PR_NAME=$pr_name" >> "$GITHUB_ENV"

      - name: Create Pull Request
        uses: peter-evans/create-pull-request@v5
        with:
          token: ${{ secrets.GH_CREATE_PR_TOKEN }}
          # Only these fixture and metrics paths are added to the commit.
          add-paths: |
            test_unstructured_ingest/expected-structured-output
            test_unstructured_ingest/metrics
          commit-message: "Update ingest test fixtures"
          branch: ${{ env.BRANCH_NAME }}
          title: "${{ env.PR_NAME }} <- Ingest test fixtures update"
          assignees: ${{ github.actor }}
          reviewers: ${{ github.actor }}
          delete-branch: true
          body: |
            This pull request includes updated ingest test fixtures.
            Please review and merge if appropriate.
          # NOTE(review): github.head_ref is only populated for pull_request
          # events - confirm the intended base for manually dispatched runs.
          base: ${{ github.head_ref }}