build: script to update all ingest fixtures, add azure ingest fixtures (#367)

- Updates CI to install tesseract version 5.3.0 (an improvement over 4.x in several ways, including performance).
- Adds Azure expected-output fixtures as more useful reference points and as a repro for issue #346 ("Some PDF's with scanned images return empty elements").
- Adds a script to regenerate ingest test fixtures that is run in an ubuntu docker container (like CI), with the same version of tesseract. See the comments in scripts/ingest-test-fixtures-update.sh for details.
- Updates expected outputs with above script.
- Updates individual test-ingest scripts to update expected .json output if OVERWRITE_FIXTURES=true.
This commit is contained in:
cragwolfe 2023-04-11 00:11:50 -07:00 committed by GitHub
parent 7ec85272b7
commit 7b44bcd6e0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 5113 additions and 102 deletions

View File

@ -105,7 +105,10 @@ jobs:
source .venv/bin/activate
make install-detectron2
sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr
tesseract --version
make test
make check-coverage
make install-ingest-s3

View File

@ -26,12 +26,12 @@ install-ci: install-base-pip-packages install-nltk-models install-huggingface in
.PHONY: install-base-pip-packages
install-base-pip-packages:
python3 -m pip install pip==${PIP_VERSION}
pip install -r requirements/base.txt
python3 -m pip install -r requirements/base.txt
.PHONY: install-huggingface
install-huggingface:
python3 -m pip install pip==${PIP_VERSION}
pip install -r requirements/huggingface.txt
python3 -m pip install -r requirements/huggingface.txt
# Declare install-nltk-models as phony (this line previously read ".PHONE", which
# GNU make treats as an ordinary target, so the rule was never marked phony).
.PHONY: install-nltk-models
install-nltk-models:
@ -40,52 +40,52 @@ install-nltk-models:
.PHONY: install-test
install-test:
pip install -r requirements/test.txt
python3 -m pip install -r requirements/test.txt
.PHONY: install-dev
install-dev:
pip install -r requirements/dev.txt
python3 -m pip install -r requirements/dev.txt
.PHONY: install-build
install-build:
pip install -r requirements/build.txt
python3 -m pip install -r requirements/build.txt
.PHONY: install-ingest-google-drive
install-ingest-google-drive:
pip install -r requirements/ingest-google-drive.txt
python3 -m pip install -r requirements/ingest-google-drive.txt
## install-ingest-s3: install requirements for the s3 connector
.PHONY: install-ingest-s3
install-ingest-s3:
pip install -r requirements/ingest-s3.txt
python3 -m pip install -r requirements/ingest-s3.txt
.PHONY: install-ingest-azure
install-ingest-azure:
pip install -r requirements/ingest-azure.txt
python3 -m pip install -r requirements/ingest-azure.txt
.PHONY: install-ingest-github
install-ingest-github:
pip install -r requirements/ingest-github.txt
python3 -m pip install -r requirements/ingest-github.txt
.PHONY: install-ingest-gitlab
install-ingest-gitlab:
pip install -r requirements/ingest-gitlab.txt
python3 -m pip install -r requirements/ingest-gitlab.txt
.PHONY: install-ingest-reddit
install-ingest-reddit:
pip install -r requirements/ingest-reddit.txt
python3 -m pip install -r requirements/ingest-reddit.txt
.PHONY: install-ingest-wikipedia
install-ingest-wikipedia:
pip install -r requirements/ingest-wikipedia.txt
python3 -m pip install -r requirements/ingest-wikipedia.txt
.PHONY: install-unstructured-inference
install-unstructured-inference:
pip install -r requirements/local-inference.txt
python3 -m pip install -r requirements/local-inference.txt
.PHONY: install-detectron2
install-detectron2:
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"
python3 -m pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"
## install-local-inference: installs requirements for local inference
.PHONY: install-local-inference

View File

@ -0,0 +1,24 @@
# Dockerfile that approximates the CI image
#
# Mainly useful for updating test-ingest fixtures
#
# Build context is expected to be the repository root, since scripts/,
# requirements/ and Makefile are copied by relative path.
FROM ubuntu:22.04
# Copy only the OS setup script first and run it with the argument "root".
# NOTE(review): the meaning of the "root" argument is defined by
# scripts/setup_ubuntu.sh, which is not shown here - confirm against that script.
COPY scripts/setup_ubuntu.sh scripts/setup_ubuntu.sh
RUN bash scripts/setup_ubuntu.sh root
# Inputs for the `make install-*` targets invoked below.
COPY requirements/ requirements/
COPY Makefile Makefile
# Run subsequent RUN instructions under bash so that `source` is available.
SHELL ["/bin/bash", "-c"]
# Create and activate a virtualenv named "unstructured" on Python 3.8.15, then
# install the Python dependencies via the Makefile targets. Everything is chained
# in a single RUN because the activated environment does not persist across
# separate RUN instructions.
# NOTE(review): assumes setup_ubuntu.sh installed pyenv (with the virtualenv
# plugin) and Python 3.8.15, and wired pyenv into ~/.bashrc - TODO confirm.
RUN source ~/.bashrc && pyenv virtualenv 3.8.15 unstructured && \
source ~/.pyenv/versions/unstructured/bin/activate && \
make install-ci && \
make install-detectron2 && \
make install-ingest-s3 && \
make install-ingest-azure && \
make install-ingest-github && \
make install-ingest-gitlab && \
make install-ingest-wikipedia

11
scripts/docker-build-ubuntu.sh Executable file
View File

@ -0,0 +1,11 @@
#!/usr/bin/env bash

# Builds the "unstructured-ubuntu:latest" image from docker/ubuntu-22/Dockerfile.
# Mainly useful for building an image from which to update test-ingest fixtures.

set -eu -o pipefail

# The build context must be the repository root, which is the parent of the
# directory containing this script.
here=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "${here}/.." || exit 1

docker build \
    --file docker/ubuntu-22/Dockerfile \
    --tag unstructured-ubuntu:latest \
    --progress plain \
    .

View File

@ -0,0 +1,49 @@
#!/usr/bin/env bash

# Structured .json output from PDF's or images may differ subtly (or not so subtly)
# based on the version of tesseract, its dependencies, and chip architecture.
#
# To update ingest-test expected outputs (structured .json files), this script:
#   * builds an ubuntu image that
#     * matches CI with respect to tesseract and OS deps
#     * installs python dependencies from the local requirements/ directory
#   * runs each test ingest script with OVERWRITE_FIXTURES=true
#     * so updates are written to test_unstructured_ingest/expected-structured-output/
#     * using local unstructured/ directory (i.e. from local git branch)
#
# It is recommended to run this script on x86_64 hardware.

set -eu -o pipefail

# Change to the root of the repository
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

# Only a warning: the script still runs on other architectures.
ARCHITECTURE=$(uname -m)
if [ "$ARCHITECTURE" != "x86_64" ]; then
    echo "Warning: This script is designed to run on x86_64 hardware, but you're running on $ARCHITECTURE."
fi

./scripts/docker-build-ubuntu.sh

# Warn the user if they have an old image
IMAGE_NAME="unstructured-ubuntu:latest"
CREATION_TIMESTAMP=$(docker inspect --format='{{.Created}}' "$IMAGE_NAME")

# Convert the image creation timestamp to epoch seconds. `date -d` is GNU-only, so
# fall back to the BSD/macOS form. The age check is purely advisory: if neither
# form can parse the timestamp, skip the check instead of aborting under `set -e`.
if ! CREATION_DATE=$(date -d "$CREATION_TIMESTAMP" +%s 2>/dev/null); then
    # BSD date needs an explicit format; drop fractional seconds and a trailing "Z".
    TRIMMED_TIMESTAMP=${CREATION_TIMESTAMP%%.*}
    TRIMMED_TIMESTAMP=${TRIMMED_TIMESTAMP%Z}
    CREATION_DATE=$(date -j -u -f "%Y-%m-%dT%H:%M:%S" "$TRIMMED_TIMESTAMP" +%s 2>/dev/null) || CREATION_DATE=""
fi

if [ -n "$CREATION_DATE" ]; then
    CURRENT_DATE=$(date +%s)
    AGE_DAYS=$(( (CURRENT_DATE - CREATION_DATE) / 86400 ))
    # Same threshold as before (previously written as "-gt 6"); the message now
    # matches it instead of claiming "more than 7 days".
    if [ "$AGE_DAYS" -ge 7 ]; then
        echo "WARNING: The image \"$IMAGE_NAME\" is at least 7 days old ($AGE_DAYS days)."
        echo "You may want to 'docker rmi $IMAGE_NAME' and rerun this script if it is not current."
    fi
else
    echo "Warning: could not determine the age of image \"$IMAGE_NAME\" (created: $CREATION_TIMESTAMP); skipping the staleness check." >&2
fi

# Mount the local package and ingest-test directories into the container so the
# tests run against the working tree and the regenerated fixtures are written
# back to the host.
docker run --rm -v "$SCRIPT_DIR"/../unstructured:/root/unstructured -v \
    "$SCRIPT_DIR"/../test_unstructured_ingest:/root/test_unstructured_ingest \
    -w /root "$IMAGE_NAME" \
    bash -c "export OVERWRITE_FIXTURES=true && source ~/.bashrc && pyenv activate unstructured &&
             ./test_unstructured_ingest/test-ingest-azure.sh &&
             ./test_unstructured_ingest/test-ingest-github.sh &&
             ./test_unstructured_ingest/test-ingest-biomed-api.sh &&
             ./test_unstructured_ingest/test-ingest-biomed-path.sh &&
             ./test_unstructured_ingest/test-ingest-s3.sh"

View File

@ -37,7 +37,7 @@ $sudo $pac upgrade -y
#### Utils
# Prerequisites
$sudo env DEBIAN_FRONTEND="noninteractive" $pac install -y gcc wget tar curl make xz-utils build-essential tzdata
$sudo env DEBIAN_FRONTEND="noninteractive" $pac install -y gcc wget tar curl make xz-utils build-essential tzdata rsync
#### Git
# Install git
@ -82,9 +82,13 @@ $sudo $pac install -y libgl1
# Install poppler
$sudo $pac install -y poppler-utils
#### Tesseract
# Install tesseract as well as Russian language
$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus libreoffice pandoc
#### OpenOffice / MSOffice doc conversion capabilities
$sudo $pac install -y libreoffice pandoc
# Install tesseract 5 as well as Russian language
$sudo $pac install -y software-properties-common
$sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus
#### libmagic
$sudo $pac install -y libmagic-dev

View File

@ -0,0 +1,378 @@
[
{
"element_id": "833e3f9d4af02845c670c31e2d6d4f9a",
"text": "Skills for Biomedical Data",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "64b2134f054446d473fce1b05d4d4c94",
"text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f3416e4bccede2117fed6bc61910bc18",
"text": "F. Huerta, PhD, Associate Director of NLM for Program Development and NLM of Data Science and Open Science Initiatives",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f14031943b3f1e34dcfc27bf02c38c09",
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "1f0a7c85704bf89e1ec17d6fe40bf29b",
"text": "General biomedical subject matter knowledge: biomedical data scientists have a general working knowledge of the principles of biology, bioinformatics, basic clinical science;",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "8a5926da311fdb0da8c4cac8e15ba79d",
"text": "Programming language expertise: biomedical data scientists should be fluent in least one programming language (typically R and/or Python);",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "fdc7dc13c15e758445efae7d34a23951",
"text": "Predictive analytics, modeling, and machine learning: while a range of methods may be useful, predictive analytics, modeling, and machine learning as especially important skills in biomedical data science;",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "52c66df43382ed215c2445f81ad76010",
"text": "Team science and scientific communication: “soft” skills, like the ability to work on teams and communicate effectively in both verbal and written venues, may be important as the more technical skills typically associated with data science.",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "5a9a1f072c06a3ae844a187dab3b9e32",
"text": "Responsible data stewardship: a successful data scientist must be able to best practices for data management and stewardship, as well as conduct research an ethical manner that maintains data security and privacy.",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "5e3d4670749a0f3753fa4bb1b328d156",
"text": "Training a biomedical data science (BDS) workforce is a central theme in NLMs Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "4c5f925a7db08289f19dbe8635d8b4cd",
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "f26d07e6b71e42596791a241e2417931",
"text": "Methodology",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "153010aa2c8aa0a0e54bdac5e14340be",
"text": "a) Responses to a",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "9b773eceddf8b7622fdec8bb3c8657ff",
"text": "Kaggle",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "811f2a14b9850c9d9c7562f29228754b",
"text": "survey",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "13d71f8611c0248d58ffa4d1230da73e",
"text": "of over",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "d03502c43d74a30b936740a9517dc4ea",
"text": ",",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "b9155b5adc04d5539379f3fc62b33711",
"text": "self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "24349c8054862cb8cbd4d857d096943e",
"text": "BD",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "caa5fc58a6d57578858155571d5d4f79",
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
"text": ") statistics and math skills; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "4b021f7187c84f22e863e931047e2fc2",
"text": ") computer science; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
"text": ") subject knowledge; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "ae19ecd18e97da5a942738ed9c37b235",
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads.",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "d14cf7578b76bba89cd14f7c65d27dce",
"text": "job ads from government (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "ff8f02c33b45fd488b21342ad816f985",
"text": "%), academia (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "7b953c4510d51c8c49bdb1f72208e813",
"text": "%), industry (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "b991274d798760827347db84d4c50aed",
"text": "%), and the nonprofit sector (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "d6dc0d0c11a894b2ce64fcc8af4cfe27",
"text": "%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "a119d3cc0dc7f3ac85725acf60229415",
"text": "Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "153010aa2c8aa0a0e54bdac5e14340be",
"text": "a) Responses to a",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "13d71f8611c0248d58ffa4d1230da73e",
"text": "of over",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "d03502c43d74a30b936740a9517dc4ea",
"text": ",",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "86bb1e8982150210536b8273bbe0b53d",
"text": "self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "73b8242ab49aacecd5561fc18ea23239",
"text": "b) Data science skills taught in BD",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "24349c8054862cb8cbd4d857d096943e",
"text": "BD",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "caa5fc58a6d57578858155571d5d4f79",
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
"text": ") statistics and math skills; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "4b021f7187c84f22e863e931047e2fc2",
"text": ") computer science; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
"text": ") subject knowledge; (",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "c865029d7025ef68891ec5c426b9aaa3",
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
"type": "ListItem",
"metadata": {
"page_number": 2
}
},
{
"element_id": "63ba3341ed94f1b2d89198e84757f871",
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "52eeeeac5a03bf69edb6126abb21f1d5",
"text": "Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "6a436d034b4636ebebdfee2765d3ac9e",
"text": "Analysis of the above data provided insights into the current state of biomedical data training, as well as a view into data science-related skills likely to be needed to prepare BDS workforce to succeed in the future. Together, these analyses informed for core skills necessary for a competitive biomedical data scientist.",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "94ba2c5be803a3cb405fc51dada2532d",
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
}
]

View File

@ -0,0 +1,322 @@
[
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "NarrativeText",
"metadata": {
"page_number": 2
}
},
{
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "Title",
"metadata": {
"page_number": 2
}
}
]

View File

@ -0,0 +1,322 @@
[
{
"element_id": "0bd6458cb49a638f3ccff515b9433cb8",
"text": "———eee eee eee\n\nInstructions for Form 3115\n(Rev. November 1987)\n\nAnniicatinn far Chancain Acnninting Mothad\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "41f3d9c83b2b4679195c9796134fd8f5",
"text": "(Section references are to the Internal Revenue Code unless otherwise noted.)\n",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "97968e4ba14bd2d082a70ec61ef2d9b1",
"text": "Long-term contracts.—If you are required to\nchange your method of accounting for long-term\ncontracts under section",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "ac843848ae2f4c656203dee90cdc207c",
"text": ", see Notice",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "3973e022e93220f9212c18d0d0c543ae",
"text": "-",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
"text": "(",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "8a5edab282632443219e051e4ade2d1d",
"text": "/",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "8a5edab282632443219e051e4ade2d1d",
"text": "/",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "bb01c44bd646ab29df9cea6459a3499b",
"text": "),",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "3973e022e93220f9212c18d0d0c543ae",
"text": "-",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "29b33c1e0aea8247e6576bd9ad14448e",
"text": "IRB",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f0d2beb7f43493694a91137e8e65b5f3",
"text": ", for the notification\nprocedures that must be followed.\n\nOther methods. —Unless the Service has\npublished a regulation or procedure to the\ncontrary, all other changes in accounting\nmethods required by the Act are automatically\nconsidered to be approved by the Commissioner.\nExamples of method changes automatically\napproved by the Commissioner are those changes\nrequired to effect: (",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "13f2a282f705590fbe7b6ce15b08862a",
"text": ") the repeal of the reserve\nmethod for bad debts of taxpayers other than\nfinancial institutions (Act section",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "fd0f38844b9901d3a4e7c44630346145",
"text": "); (",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "9820f79275e683f5afe3f2f1283de4ca",
"text": ") the\nrepeal of the installment method for sales under\na revolving credit plan (Act section",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "fd0f38844b9901d3a4e7c44630346145",
"text": "); (",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "a98378f4a88db65dff42b7d8bd75be92",
"text": ") the\nInclusion of mcome attributable to the sale or\nfurnishing of utility services no later than the year\nin which the services were provided to customers\n(Act section",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "25d6eaf57eebce49267b71ce2f347a03",
"text": "); and (",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "3cb57c50002187a715e1c5048e643c65",
"text": ") the repeal of the\ndeduction for qualified discount coupons (Act\nsection",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e9d9ab5eb5ff32a31a32bda940a33b7a",
"text": "). Do not file Form",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f88cf27baa9e77b38c7d9c688ac90417",
"text": "for these\nchanges.\n\nTime and Dinne fay Cling",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "bd913e19b877497b5480c528c96fd0f6",
"text": "Signature\n\nIndivideale\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "0c97452e61a431a9bced8091af69e908",
"text": "Individuals.—An individual desiring the change\nshould sign the application. Ifthe application\npertains to a husband and wife filing a joint\nIncome tax return, the names of both should\nappear in the heading and both should sign\nPartnerships.—The form should be signed with\nthe partnership name followed by the signature\nof one of the general partners and the words\n“General Partner.”\nCorporations, cooperatives, and insurance\ncompanies.—The form should show the name of\nthe corporation, cooperative, or insurance\nCompany and the signature of the president, vice\npresident, treasurer, assistant treasurer, or chief\naccounting officer (such as tax officer) authorized\ntosign, and his or her official title. Receivers,\ntrustees, or assignees must sign any application\nthey are required to file, For a subsidiary\ncorporation filing a consolidated return with its\nparent, the form should be signed by an officer of\nthe parent corporation,\nluciaries.—The-form should show the name\nof the estate or trust and be signed by the\nfiduciary, personal representative, executor,\nexecutrix, administrator, administratrx, etc,\nhaving legal authority to'sign, and his or her ttle.\nPreparer other than partner, officer, etc.—The\nsignature of the individual preparing the\napplication should appear in the space provided\non page",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "586e989b479e4362ebe28a6954c1427b",
"text": "If the individual or firm is also authorized to",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "226fa83297914d5195e002508d61fb1d",
"text": "General Instructions\n\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "d0e1e01dcbc7b4dfa2df8fe1d7c71acc",
"text": "General Instructions\nPurpose of Form\n\nPile thse Seen te vepsect a phepee se\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f0e951e5bcb4a6070fa6672b37822348",
"text": "Purpose of Form\n\nCin bce Secon te cece cget.\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "03fe77cbc1e2a87cdf64a64b839545b5",
"text": "alata\nGenerally, applicants must complete Section\n\nA. In addition, complete the appropriate sections\n\n(B:1 through H) for which a change is desired.\n\nYou must give all relevant facts. including a\n\n",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "7fc74bd7792c99bb71777aeaea5bf987",
"text": "Time and Place for Filing\namacall, ammlinapte pret file trie\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "efd2dea48b678ae3052a8fae284dcd9b",
"text": "on page ©.\n\nIf the individual or firm is also authorized to\nrepresent the applicant before the IRS, receive\na copy of the requested ruling, or perform any\nother act(s), the power of attorney must reflect\nsuch authorization(s).\n\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "8b35e7c212710b1099b675ce9394fb47",
"text": "Se NB ON\n\nState whether you desire a conference in the\nNational Office if the Service proposes to\ndisapprove your application.\n\n",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "7c9e868b449a25434af63386e8c72962",
"text": "Affiliated Groups\n\nTavmayare that ara mam)\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "d6d128db1d06743816667d277159e1e9",
"text": "Changes to Accounting Methods\nRequired Under the Tax Reform Act\nof 1986\n\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "4f022ad16f9de29b399fe4e77ebec3da",
"text": "Uniform capitalization rules and limitation on\ncash method. —If you are required to change\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "231967b6e23633ce4b794ba4d92195b5",
"text": "Specific Instructions\nSection A\n\nItem Sa. nage 1 «-\"Taxahle incams\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "1dda7db8eaa236f190c9f1385666af36",
"text": "anearly application.\nNote: if this form is being filed in accordance\nwith Rev. Proc. 74-11, see Section G below.\n\na.\n\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "b4a7f10875d4301b0cbce5eff69f64df",
"text": "Late Applications\n\nMe cms anmiimation ie Fler\n",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e4a97fbdd3d6f33335ec71deba7af01f",
"text": "includes total sales (net of returns and\nallowances) and all amounts received for\nservices. in addition, gross receipts include any\nincome from investments and from incidental or\noutside sources (e.g., interest, dividends, rents,\nroyalties, and annuities). However, if you area\nresaler of personal property, exclude from gross\nreceipts any amounts not derived in the ordinary\ncourse of a trade or business. Gross receipts do\nnot include amounts received for sales taxes if,\ntunder the applicable state or local law, the taxis\nlegally imposed on the purchaser of the good or\nservice, and the taxpayer merely collects and\nremits the tax to the taxing authority.\n",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "786c2aaee9fcae020f4b01a298e4d141",
"text": "Disregard the instructions under Time and\nPlace for Filing and Late Applications. instead,\nattach Form 3115 to your income tax return for\nthe year of change; do not file it separately. Also\ninclude on a separate statement accompanying\nthe Form 3115 the period over which the section\n481(2) adjustment will be taken into account and\nthe basis for that conclusion. Identify the\n\n",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f2db523f6d52de1e67f6e8c1c81a8069",
"text": "Identifying Number\n",
"type": "Title",
"metadata": {
"page_number": 1
}
}
]

View File

@ -0,0 +1,226 @@
[
{
"element_id": "41f6e17bf5e9a407fcca74e902f802a0",
"text": "News Around NOAA",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "aa589c25dc22dcc8a75baba1244e6c8f",
"text": "National Program",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "62c26d2e16774d2334bd804c7bb6a711",
"text": "Are You Weather-Ready for the Spring?",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "32709cd3bec72640bbbe32f58e6e23f6",
"text": "Weather.gov >",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "2661da76db570876b075083aaeeaee55",
"text": "News Around NOAA > Are You Weather-Ready for the Spring?",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "fab6c4df083f0fb6f324fff65b652c86",
"text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "45c26cf3457e6d18985a435e2c0fcc65",
"text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "77f5acc603de9a165ed87a5c3fbaf14a",
"text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "8f19bcaabbd1bafa5e9826ac69766c8b",
"text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "1245f9cf9e019713391e4ee3bac54a63",
"text": "Collaboration Get Involved Social Media WRN Ambassadors Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only) SKYWARN",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "23dfa7f98424dbf86e00b3d500096dfa",
"text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "93202df2ec7081b28b47901b5c287a5a",
"text": "International",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "e53d6a9c615bdf1a8d7b98a67cade488",
"text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science",
"type": "ListItem",
"metadata": {
"page_number": 1
}
},
{
"element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0",
"text": "The spring season is all about change a rebirth both literally and figuratively. Even though the spring season doesnt officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "7184168da442c6ef28553b274bf2be8f",
"text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "f3be9748ecd68b20d706548129baa22d",
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "126c3cd201fb259cfeabc6bffc0b5473",
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "c1944fb037f3e1cb14969bc59a7dd9c2",
"text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in springs moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "fa1b939ef6159d95260bc095f58ebbc2",
"text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "47d5d0d27a35a36d7467dfc8b6e089b3",
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us.",
"type": "NarrativeText",
"metadata": {
"page_number": 1
}
},
{
"element_id": "129c678fce59acee7ac6a6fdb67b6310",
"text": "Disclaimer",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "3c96caaebd949e39d25b3ccf4133c5d8",
"text": "Information Quality",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "b79cac926e0b2e347e72cc91d5174037",
"text": "Help",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "4c4e436f9a453c776dbf011f98d932d6",
"text": "Glossary",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "506ff394621596dd88138642eddfc1e4",
"text": "Privacy Policy",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "c70ae8c30a61c450d2c5148d1b6a0447",
"text": "Freedom of Information Act (FOIA)",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "5d8c71abc527284cd463aa58f3f48098",
"text": "About Us",
"type": "Title",
"metadata": {
"page_number": 1
}
},
{
"element_id": "a8a00c355d2fa1461d532a1088274f32",
"text": "Career Opportunities",
"type": "Title",
"metadata": {
"page_number": 1
}
}
]

View File

@ -48,8 +48,8 @@
}
},
{
"element_id": "4bf1c98e38b4b85a7bb1bedb76383117",
"text": "(http: ||creativecommons. org/licenses/by- nc- -nd/4, 0/). ",
"element_id": "3ec82e9a5e3b39e7710305fd9be4e8d5",
"text": "(http: ||creativecommons. org/licenses/by- nc- -nd/4, 0/).",
"type": "NarrativeText",
"metadata": {
"page_number": 1
@ -288,8 +288,8 @@
}
},
{
"element_id": "335719aca05b1cfa324181802cd3f003",
"text": "Eemoos—{1So *2055 . —é —\"15 — Control2 — 8250.0000001 0.00001 0.001 o1Current Density (A/cm2)",
"element_id": "07e8e4f9666bde08d71c1617f69eddd1",
"text": "Potential (Vv)nm°in°}ary=ES 724250.0000001T T T0.00001 0.001 olCurrent Density (A/cm2)",
"type": "FigureCaption",
"metadata": {
"page_number": 4
@ -344,8 +344,8 @@
}
},
{
"element_id": "1da08457018fe7f4244669a3023cfc2f",
"text": "oe ae TRE OaEmcee Det: DOE eee ",
"element_id": "6959a323ee23c858c3b1411b05db6ebf",
"text": "SEM HV: Q0KY WD: 14.89 rmrmDEM MAO: 209 x Det: DOE Pecforsence In nenospact",
"type": "FigureCaption",
"metadata": {
"page_number": 5
@ -360,8 +360,8 @@
}
},
{
"element_id": "73504a9f03b3be882a3ecd5b862079af",
"text": "Fol ieadLSpena ",
"element_id": "b0a40261108ea21c6136d3172b4cd987",
"text": "gEOOfeSem ny. 200 Rv",
"type": "FigureCaption",
"metadata": {
"page_number": 5
@ -376,8 +376,8 @@
}
},
{
"element_id": "c9547666563138eb19ca35a3b250a190",
"text": "atSEM HY: 20.0KU ",
"element_id": "bb1b80b1cdf7f88847e1c8231fb4aae7",
"text": "aSEM HY: 20.0KV",
"type": "FigureCaption",
"metadata": {
"page_number": 5
@ -408,8 +408,8 @@
}
},
{
"element_id": "73e781ff9b3df24b670a347e44da068f",
"text": " ouH,;COCHNY OHOH",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 6

View File

@ -48,8 +48,8 @@
}
},
{
"element_id": "958d8b4aff103d20c38caf156a80c238",
"text": " (http: ||creativecommons. org/licenses/by- nce-nd/4.0/). ",
"element_id": "b16a14378c7e3641edaab4832d548e08",
"text": "(http: ||creativecommons. org/licenses/by- nce-nd/4.0/).",
"type": "NarrativeText",
"metadata": {
"page_number": 1

View File

@ -8,8 +8,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 1
@ -112,8 +112,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 3
@ -192,8 +192,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 5
@ -216,24 +216,24 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "b42bc8a6e8c708b898dc318090243df5",
"text": ": ¥ A4 : ¢@Nyy4 4LANIK¥||SW",
"type": "FigureCaption",
"metadata": {
"page_number": 5
}
},
{
"element_id": "955571c35ce6527872230bf53595aa9e",
"text": "eo re ",
"element_id": "06a308d0660e39112e3611ca071fc163",
"text": "i ee ee",
"type": "FigureCaption",
"metadata": {
"page_number": 5
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 5
@ -288,8 +288,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 6
@ -344,8 +344,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 7
@ -432,8 +432,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 9
@ -472,8 +472,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 10

View File

@ -8,8 +8,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 1
@ -72,8 +72,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 3
@ -272,8 +272,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 6
@ -376,8 +376,8 @@
}
},
{
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
"text": " ",
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
"text": "",
"type": "FigureCaption",
"metadata": {
"page_number": 8
@ -392,8 +392,8 @@
}
},
{
"element_id": "28285134b5841ec7151f35f18434efe9",
"text": "AY ny ",
"element_id": "bd051434f4157e51fd1185e80bd847f8",
"text": "AY nO",
"type": "FigureCaption",
"metadata": {
"page_number": 9
@ -424,8 +424,8 @@
}
},
{
"element_id": "4ad4b4c50d891d618b06f85c6b398770",
"text": " y[he ere AE,BEISSS ",
"element_id": "533260e4d7afb457efe61c53a93718bf",
"text": "y[hee AESUROa er",
"type": "FigureCaption",
"metadata": {
"page_number": 9

View File

@ -8,10 +8,30 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--remote-url abfs://container1/ \
--azure-account-name azureunstructured1 \
--structured-output-dir azure-ingest-output \
--reprocess \
--num-processes 2
if [ "$(find 'azure-ingest-output' -type f -printf '.' | wc -c)" -ne 5 ]; then
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
cp azure-ingest-output/* test_unstructured_ingest/expected-structured-output/azure-blob-storage/
elif ! diff -ru test_unstructured_ingest/expected-structured-output/azure-blob-storage azure-ingest-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI."
echo
echo "5 files should have been created."
exit 1
fi

View File

@ -15,18 +15,31 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--biomed-api-until "2019-01-02+00:03:10" \
--structured-output-dir biomed-ingest-output-api \
--num-processes 2 \
--reprocess \
--verbose \
--download-dir biomed-download-api \
--preserve-downloads
if ! diff -ru biomed-ingest-output-api test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, copy the outputs from"
echo "biomed-ingest-output-api/ to test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api/ after running"
echo
echo "PYTHONPATH=. ./unstructured/ingest/main.py --biomed-api-from '2019-01-02' --biomed-api-until '2019-01-02+00:03:10' --structured-output-dir biomed-ingest-output-api --num-processes 2 --verbose --download-dir biomed-download-api --preserve-downloads"
echo
exit 1
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
OWNER_GROUP=$(stat -c "%u:%g" test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api)
rsync -rv --chown="$OWNER_GROUP" biomed-ingest-output-api/ test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api
elif ! diff -ru biomed-ingest-output-api test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI."
echo
exit 1
fi

View File

@ -19,14 +19,26 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--preserve-downloads
if ! diff -ru biomed-ingest-output-path test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, copy the outputs from"
echo "biomed-ingest-output-path/ to test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path/ after running"
echo
echo "PYTHONPATH=. ./unstructured/ingest/main.py --biomed-path 'oa_pdf/07/07/sbaa031.073.PMC7234218.pdf' --structured-output-dir biomed-ingest-output-path --num-processes 2 --verbose --download-dir biomed-download-path --preserve-downloads"
echo
exit 1
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
OWNER_GROUP=$(stat -c "%u:%g" test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path)
rsync -rv --chown="$OWNER_GROUP" biomed-ingest-output-path/ test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path
elif ! diff -ru biomed-ingest-output-path test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI."
echo
exit 1
fi

View File

@ -17,16 +17,30 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--github-url dcneiner/Downloadify \
--git-file-glob '*.html,*.txt' \
--structured-output-dir github-downloadify-output \
--reprocess \
--preserve-downloads \
--verbose
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, copy the outputs from"
echo "github-downloadify-output/ to test_unstructured_ingest/expected-structured-output/github-downloadify/ after running"
echo
echo " PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose"
echo
exit 1
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
cp github-downloadify-output/* test_unstructured_ingest/expected-structured-output/github-downloadify/
elif ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI."
echo
exit 1
fi

View File

@ -14,15 +14,29 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
--s3-anonymous \
--structured-output-dir s3-small-batch-output \
--preserve-downloads \
--reprocess
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
cp s3-small-batch-output/small-pdf-set/* test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/
elif ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
echo
echo "There are differences from the previously checked-in structured outputs."
echo
echo "If these differences are acceptable, copy the outputs from"
echo "s3-small-batch-output/ to test_unstructured_ingest/expected-structured-output/s3-small-batch/ after running"
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
echo
echo " PYTHONPATH=. python examples/ingest/s3-small-batch/main.py --structured-output-dir s3-small-batch-output"
echo " export OVERWRITE_FIXTURES=true"
echo
echo "and then rerun this script."
echo
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
echo "to update fixtures for CI,"
echo
exit 1
fi

View File

@ -5,6 +5,9 @@ set -eux -o pipefail
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
export OMP_THREAD_LIMIT=1
./test_unstructured_ingest/test-ingest-s3.sh
./test_unstructured_ingest/test-ingest-azure.sh
./test_unstructured_ingest/test-ingest-github.sh