mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-12 07:34:09 +00:00
build: script to update all ingest fixtures, add azure ingest fixtures (#367)
- Updates CI to install tesseract version 5.3.0 (better than 4.x in various ways, including performance). - Adds Azure expected-output fixtures for more useful reference points and as a repro for issue #346 ("Some PDF's with scanned images return empty elements"). - Adds a script to regenerate ingest test fixtures; it runs in an Ubuntu Docker container (like CI) with the same version of tesseract. See the comments in scripts/ingest-test-fixtures-update.sh for details. - Updates expected outputs with the above script. - Updates individual test-ingest scripts to overwrite the expected .json output if OVERWRITE_FIXTURES=true.
This commit is contained in:
parent
7ec85272b7
commit
7b44bcd6e0
5
.github/workflows/ci.yml
vendored
5
.github/workflows/ci.yml
vendored
@ -105,7 +105,10 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
make install-detectron2
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libmagic-dev poppler-utils tesseract-ocr libreoffice pandoc
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
tesseract --version
|
||||
make test
|
||||
make check-coverage
|
||||
make install-ingest-s3
|
||||
|
||||
28
Makefile
28
Makefile
@ -26,12 +26,12 @@ install-ci: install-base-pip-packages install-nltk-models install-huggingface in
|
||||
.PHONY: install-base-pip-packages
|
||||
install-base-pip-packages:
|
||||
python3 -m pip install pip==${PIP_VERSION}
|
||||
pip install -r requirements/base.txt
|
||||
python3 -m pip install -r requirements/base.txt
|
||||
|
||||
.PHONY: install-huggingface
|
||||
install-huggingface:
|
||||
python3 -m pip install pip==${PIP_VERSION}
|
||||
pip install -r requirements/huggingface.txt
|
||||
python3 -m pip install -r requirements/huggingface.txt
|
||||
|
||||
# Phony declaration: install-nltk-models names an action, not a file on disk.
.PHONY: install-nltk-models
|
||||
install-nltk-models:
|
||||
@ -40,52 +40,52 @@ install-nltk-models:
|
||||
|
||||
.PHONY: install-test
|
||||
install-test:
|
||||
pip install -r requirements/test.txt
|
||||
python3 -m pip install -r requirements/test.txt
|
||||
|
||||
.PHONY: install-dev
|
||||
install-dev:
|
||||
pip install -r requirements/dev.txt
|
||||
python3 -m pip install -r requirements/dev.txt
|
||||
|
||||
.PHONY: install-build
|
||||
install-build:
|
||||
pip install -r requirements/build.txt
|
||||
python3 -m pip install -r requirements/build.txt
|
||||
|
||||
.PHONY: install-ingest-google-drive
|
||||
install-ingest-google-drive:
|
||||
pip install -r requirements/ingest-google-drive.txt
|
||||
python3 -m pip install -r requirements/ingest-google-drive.txt
|
||||
|
||||
## install-ingest-s3: install requirements for the s3 connector
|
||||
.PHONY: install-ingest-s3
|
||||
install-ingest-s3:
|
||||
pip install -r requirements/ingest-s3.txt
|
||||
python3 -m pip install -r requirements/ingest-s3.txt
|
||||
|
||||
.PHONY: install-ingest-azure
|
||||
install-ingest-azure:
|
||||
pip install -r requirements/ingest-azure.txt
|
||||
python3 -m pip install -r requirements/ingest-azure.txt
|
||||
|
||||
.PHONY: install-ingest-github
|
||||
install-ingest-github:
|
||||
pip install -r requirements/ingest-github.txt
|
||||
python3 -m pip install -r requirements/ingest-github.txt
|
||||
|
||||
.PHONY: install-ingest-gitlab
|
||||
install-ingest-gitlab:
|
||||
pip install -r requirements/ingest-gitlab.txt
|
||||
python3 -m pip install -r requirements/ingest-gitlab.txt
|
||||
|
||||
.PHONY: install-ingest-reddit
|
||||
install-ingest-reddit:
|
||||
pip install -r requirements/ingest-reddit.txt
|
||||
python3 -m pip install -r requirements/ingest-reddit.txt
|
||||
|
||||
.PHONY: install-ingest-wikipedia
|
||||
install-ingest-wikipedia:
|
||||
pip install -r requirements/ingest-wikipedia.txt
|
||||
python3 -m pip install -r requirements/ingest-wikipedia.txt
|
||||
|
||||
.PHONY: install-unstructured-inference
|
||||
install-unstructured-inference:
|
||||
pip install -r requirements/local-inference.txt
|
||||
python3 -m pip install -r requirements/local-inference.txt
|
||||
|
||||
.PHONY: install-detectron2
|
||||
install-detectron2:
|
||||
pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"
|
||||
python3 -m pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"
|
||||
|
||||
## install-local-inference: installs requirements for local inference
|
||||
.PHONY: install-local-inference
|
||||
|
||||
24
docker/ubuntu-22/Dockerfile
Normal file
24
docker/ubuntu-22/Dockerfile
Normal file
@ -0,0 +1,24 @@
|
||||
# Image that approximates the CI environment.
#
# Its main use is regenerating the test-ingest fixtures.

FROM ubuntu:22.04

# OS-level setup is delegated to the repository's Ubuntu setup script,
# invoked here with the argument "root".
COPY scripts/setup_ubuntu.sh scripts/setup_ubuntu.sh
RUN bash scripts/setup_ubuntu.sh root

# Requirement lists plus the Makefile whose install targets are run below.
COPY requirements/ requirements/
COPY Makefile Makefile

# Use bash for the remaining RUN steps; the step below relies on `source`.
SHELL ["/bin/bash", "-c"]

# Create and activate the "unstructured" pyenv virtualenv (Python 3.8.15),
# then install the CI, detectron2 and ingest-connector dependencies into it.
RUN source ~/.bashrc \
 && pyenv virtualenv 3.8.15 unstructured \
 && source ~/.pyenv/versions/unstructured/bin/activate \
 && make install-ci \
 && make install-detectron2 \
 && make install-ingest-s3 \
 && make install-ingest-azure \
 && make install-ingest-github \
 && make install-ingest-gitlab \
 && make install-ingest-wikipedia
|
||||
11
scripts/docker-build-ubuntu.sh
Executable file
11
scripts/docker-build-ubuntu.sh
Executable file
@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env bash

# Builds the unstructured-ubuntu:latest image from docker/ubuntu-22/Dockerfile,
# using the repository root as the build context.
#
# Mainly useful for building an image from which to update test-ingest fixtures.

set -eu -o pipefail

# Resolve the directory holding this script, then move to its parent
# (the root of the repository) so the relative paths below resolve.
script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
cd "${script_dir}/.." || exit 1

docker build \
    --tag unstructured-ubuntu:latest \
    --progress plain \
    --file docker/ubuntu-22/Dockerfile \
    .
|
||||
49
scripts/ingest-test-fixtures-update.sh
Executable file
49
scripts/ingest-test-fixtures-update.sh
Executable file
@ -0,0 +1,49 @@
|
||||
#!/usr/bin/env bash

# Structured .json output from PDFs or images may differ subtly (or not so subtly)
# based on the version of tesseract, its dependencies, and chip architecture.
#
# To update ingest-test expected outputs (structured .json files), this script:
#   * builds an ubuntu image (via scripts/docker-build-ubuntu.sh) that
#     * matches CI with respect to tesseract and OS deps
#     * installs python dependencies from the local requirements/ directory
#   * runs each test ingest script with OVERWRITE_FIXTURES=true
#     * so updates are written to test_unstructured_ingest/expected-structured-output/
#     * using local unstructured/ directory (i.e. from local git branch)
#
# It is recommended to run this script on x86_64 hardware.
#
# Warnings and notes are written to stderr.

set -eu -o pipefail

#######################################
# Convert a timestamp as printed by `docker inspect --format='{{.Created}}'`
# into seconds since the epoch.
# Arguments: $1 - timestamp, e.g. 2023-03-15T12:34:56.123456789Z
#            (assumed to be UTC when parsed with BSD date)
# Outputs:   epoch seconds on stdout
# Returns:   0 on success, non-zero if the local `date` cannot parse it
#######################################
timestamp_to_epoch() {
    local raw=$1
    local trimmed

    # Only GNU date understands --version; use it to pick the parsing syntax.
    if date --version >/dev/null 2>&1; then
        date -d "$raw" +%s 2>/dev/null
    else
        # BSD/macOS date needs an explicit input format and cannot parse
        # fractional seconds or the trailing "Z", so strip both first.
        trimmed=${raw%%.*}
        trimmed=${trimmed%Z}
        date -j -u -f '%Y-%m-%dT%H:%M:%S' "$trimmed" +%s 2>/dev/null
    fi
}

# Change to the root of the repository
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "$SCRIPT_DIR"/.. || exit 1

ARCHITECTURE=$(uname -m)

if [ "$ARCHITECTURE" != "x86_64" ]; then
    echo "Warning: This script is designed to run on x86_64 hardware, but you're running on $ARCHITECTURE." >&2
fi

./scripts/docker-build-ubuntu.sh

# Warn the user if they have an old image.
# NOTE(review): presumably a fully cached build keeps the image's original
# creation time, which is why a just-built image can still be old -- confirm.
IMAGE_NAME="unstructured-ubuntu:latest"
CREATION_TIMESTAMP=$(docker inspect --format='{{.Created}}' "$IMAGE_NAME")

# The age check is advisory only: if the timestamp cannot be parsed on this
# host, skip the warning rather than letting `set -e` abort the whole run.
if CREATION_DATE=$(timestamp_to_epoch "$CREATION_TIMESTAMP"); then
    CURRENT_DATE=$(date +%s)
    AGE_DAYS=$(( (CURRENT_DATE - CREATION_DATE) / 86400 ))
    if [ "$AGE_DAYS" -ge 7 ]; then
        echo "WARNING: The image \"$IMAGE_NAME\" is at least 7 days old ($AGE_DAYS days)." >&2
        echo "You may want to 'docker rmi $IMAGE_NAME' and rerun this script if it is not current." >&2
    fi
else
    echo "Note: could not determine the age of image \"$IMAGE_NAME\"; skipping the staleness check." >&2
fi

# Run every ingest test inside the container with the local unstructured/ and
# test_unstructured_ingest/ directories mounted, so that regenerated fixtures
# are written back to the working tree. The && chain stops at the first
# failing test script.
docker run --rm \
       -v "$SCRIPT_DIR"/../unstructured:/root/unstructured \
       -v "$SCRIPT_DIR"/../test_unstructured_ingest:/root/test_unstructured_ingest \
       -w /root "$IMAGE_NAME" \
       bash -c "export OVERWRITE_FIXTURES=true && source ~/.bashrc && pyenv activate unstructured &&
                ./test_unstructured_ingest/test-ingest-azure.sh &&
                ./test_unstructured_ingest/test-ingest-github.sh &&
                ./test_unstructured_ingest/test-ingest-biomed-api.sh &&
                ./test_unstructured_ingest/test-ingest-biomed-path.sh &&
                ./test_unstructured_ingest/test-ingest-s3.sh"
|
||||
@ -37,7 +37,7 @@ $sudo $pac upgrade -y
|
||||
|
||||
#### Utils
|
||||
# Prerequisites
|
||||
$sudo env DEBIAN_FRONTEND="noninteractive" $pac install -y gcc wget tar curl make xz-utils build-essential tzdata
|
||||
$sudo env DEBIAN_FRONTEND="noninteractive" $pac install -y gcc wget tar curl make xz-utils build-essential tzdata rsync
|
||||
|
||||
#### Git
|
||||
# Install git
|
||||
@ -82,9 +82,13 @@ $sudo $pac install -y libgl1
|
||||
# Install poppler
|
||||
$sudo $pac install -y poppler-utils
|
||||
|
||||
#### Tesseract
|
||||
# Install tesseract as well as Russian language
|
||||
$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus libreoffice pandoc
|
||||
#### OpenOffice / MSOffice doc conversion capabilities
|
||||
$sudo $pac install -y libreoffice pandoc
|
||||
|
||||
# Install tesseract 5 as well as Russian language
|
||||
$sudo $pac install -y software-properties-common
|
||||
$sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
$sudo $pac install -y tesseract-ocr libtesseract-dev tesseract-ocr-rus
|
||||
|
||||
#### libmagic
|
||||
$sudo $pac install -y libmagic-dev
|
||||
|
||||
@ -0,0 +1,378 @@
|
||||
[
|
||||
{
|
||||
"element_id": "833e3f9d4af02845c670c31e2d6d4f9a",
|
||||
"text": "Skills for Biomedical Data",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "64b2134f054446d473fce1b05d4d4c94",
|
||||
"text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f3416e4bccede2117fed6bc61910bc18",
|
||||
"text": "F. Huerta, PhD, Associate Director of NLM for Program Development and NLM of Data Science and Open Science Initiatives",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f14031943b3f1e34dcfc27bf02c38c09",
|
||||
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs. Suggested high-level core skills include:",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "1f0a7c85704bf89e1ec17d6fe40bf29b",
|
||||
"text": "General biomedical subject matter knowledge: biomedical data scientists have a general working knowledge of the principles of biology, bioinformatics, basic clinical science;",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8a5926da311fdb0da8c4cac8e15ba79d",
|
||||
"text": "Programming language expertise: biomedical data scientists should be fluent in least one programming language (typically R and/or Python);",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "fdc7dc13c15e758445efae7d34a23951",
|
||||
"text": "Predictive analytics, modeling, and machine learning: while a range of methods may be useful, predictive analytics, modeling, and machine learning as especially important skills in biomedical data science;",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "52c66df43382ed215c2445f81ad76010",
|
||||
"text": "Team science and scientific communication: “soft” skills, like the ability to work on teams and communicate effectively in both verbal and written venues, may be important as the more technical skills typically associated with data science.",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "5a9a1f072c06a3ae844a187dab3b9e32",
|
||||
"text": "Responsible data stewardship: a successful data scientist must be able to best practices for data management and stewardship, as well as conduct research an ethical manner that maintains data security and privacy.",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "5e3d4670749a0f3753fa4bb1b328d156",
|
||||
"text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4c5f925a7db08289f19dbe8635d8b4cd",
|
||||
"text": "this commitment, a recent report to the NLM Director recommended working across NIH to identify and develop core skills required of a biomedical data scientist to consistency across the cohort of NIH-trained data scientists. This report provides a set of recommended core skills based on analysis of current BD2K-funded training programs, biomedical data science job ads, and practicing members of the current data science workforce.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f26d07e6b71e42596791a241e2417931",
|
||||
"text": "Methodology",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "153010aa2c8aa0a0e54bdac5e14340be",
|
||||
"text": "a) Responses to a",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "9b773eceddf8b7622fdec8bb3c8657ff",
|
||||
"text": "Kaggle",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "811f2a14b9850c9d9c7562f29228754b",
|
||||
"text": "survey",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "13d71f8611c0248d58ffa4d1230da73e",
|
||||
"text": "of over",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d03502c43d74a30b936740a9517dc4ea",
|
||||
"text": ",",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "b9155b5adc04d5539379f3fc62b33711",
|
||||
"text": "self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
|
||||
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "24349c8054862cb8cbd4d857d096943e",
|
||||
"text": "BD",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "caa5fc58a6d57578858155571d5d4f79",
|
||||
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
|
||||
"text": ") statistics and math skills; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4b021f7187c84f22e863e931047e2fc2",
|
||||
"text": ") computer science; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
|
||||
"text": ") subject knowledge; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "ae19ecd18e97da5a942738ed9c37b235",
|
||||
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads.",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d14cf7578b76bba89cd14f7c65d27dce",
|
||||
"text": "job ads from government (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "ff8f02c33b45fd488b21342ad816f985",
|
||||
"text": "%), academia (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "7b953c4510d51c8c49bdb1f72208e813",
|
||||
"text": "%), industry (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "b991274d798760827347db84d4c50aed",
|
||||
"text": "%), and the nonprofit sector (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d6dc0d0c11a894b2ce64fcc8af4cfe27",
|
||||
"text": "%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "a119d3cc0dc7f3ac85725acf60229415",
|
||||
"text": "Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "153010aa2c8aa0a0e54bdac5e14340be",
|
||||
"text": "a) Responses to a",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "13d71f8611c0248d58ffa4d1230da73e",
|
||||
"text": "of over",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d03502c43d74a30b936740a9517dc4ea",
|
||||
"text": ",",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "86bb1e8982150210536b8273bbe0b53d",
|
||||
"text": "self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use.",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "73b8242ab49aacecd5561fc18ea23239",
|
||||
"text": "b) Data science skills taught in BD",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
|
||||
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "24349c8054862cb8cbd4d857d096943e",
|
||||
"text": "BD",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "caa5fc58a6d57578858155571d5d4f79",
|
||||
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
|
||||
"text": ") statistics and math skills; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4b021f7187c84f22e863e931047e2fc2",
|
||||
"text": ") computer science; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
|
||||
"text": ") subject knowledge; (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "c865029d7025ef68891ec5c426b9aaa3",
|
||||
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A.",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "63ba3341ed94f1b2d89198e84757f871",
|
||||
"text": "b) Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "52eeeeac5a03bf69edb6126abb21f1d5",
|
||||
"text": "Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "6a436d034b4636ebebdfee2765d3ac9e",
|
||||
"text": "Analysis of the above data provided insights into the current state of biomedical data training, as well as a view into data science-related skills likely to be needed to prepare BDS workforce to succeed in the future. Together, these analyses informed for core skills necessary for a competitive biomedical data scientist.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "94ba2c5be803a3cb405fc51dada2532d",
|
||||
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,322 @@
|
||||
[
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 2
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -0,0 +1,322 @@
|
||||
[
|
||||
{
|
||||
"element_id": "0bd6458cb49a638f3ccff515b9433cb8",
|
||||
"text": "———eee eee eee\n\nInstructions for Form 3115\n(Rev. November 1987)\n\nAnniicatinn far Chancain Acnninting Mothad\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "41f3d9c83b2b4679195c9796134fd8f5",
|
||||
"text": "(Section references are to the Internal Revenue Code unless otherwise noted.)\n",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "97968e4ba14bd2d082a70ec61ef2d9b1",
|
||||
"text": "Long-term contracts.—If you are required to\nchange your method of accounting for long-term\ncontracts under section",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "ac843848ae2f4c656203dee90cdc207c",
|
||||
"text": ", see Notice",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "3973e022e93220f9212c18d0d0c543ae",
|
||||
"text": "-",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "32ebb1abcc1c601ceb9c4e3c4faba0ca",
|
||||
"text": "(",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8a5edab282632443219e051e4ade2d1d",
|
||||
"text": "/",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8a5edab282632443219e051e4ade2d1d",
|
||||
"text": "/",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "bb01c44bd646ab29df9cea6459a3499b",
|
||||
"text": "),",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "3973e022e93220f9212c18d0d0c543ae",
|
||||
"text": "-",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "29b33c1e0aea8247e6576bd9ad14448e",
|
||||
"text": "IRB",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f0d2beb7f43493694a91137e8e65b5f3",
|
||||
"text": ", for the notification\nprocedures that must be followed.\n\nOther methods. —Unless the Service has\npublished a regulation or procedure to the\ncontrary, all other changes in accounting\nmethods required by the Act are automatically\nconsidered to be approved by the Commissioner.\nExamples of method changes automatically\napproved by the Commissioner are those changes\nrequired to effect: (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "13f2a282f705590fbe7b6ce15b08862a",
|
||||
"text": ") the repeal of the reserve\nmethod for bad debts of taxpayers other than\nfinancial institutions (Act section",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "fd0f38844b9901d3a4e7c44630346145",
|
||||
"text": "); (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "9820f79275e683f5afe3f2f1283de4ca",
|
||||
"text": ") the\nrepeal of the installment method for sales under\na revolving credit plan (Act section",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "fd0f38844b9901d3a4e7c44630346145",
|
||||
"text": "); (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "a98378f4a88db65dff42b7d8bd75be92",
|
||||
"text": ") the\nInclusion of mcome attributable to the sale or\nfurnishing of utility services no later than the year\nin which the services were provided to customers\n(Act section",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "25d6eaf57eebce49267b71ce2f347a03",
|
||||
"text": "); and (",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "3cb57c50002187a715e1c5048e643c65",
|
||||
"text": ") the repeal of the\ndeduction for qualified discount coupons (Act\nsection",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e9d9ab5eb5ff32a31a32bda940a33b7a",
|
||||
"text": "). Do not file Form",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f88cf27baa9e77b38c7d9c688ac90417",
|
||||
"text": "for these\nchanges.\n\nTime and Dinne fay Cling",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "bd913e19b877497b5480c528c96fd0f6",
|
||||
"text": "Signature\n\nIndivideale\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "0c97452e61a431a9bced8091af69e908",
|
||||
"text": "Individuals.—An individual desiring the change\nshould sign the application. Ifthe application\npertains to a husband and wife filing a joint\nIncome tax return, the names of both should\nappear in the heading and both should sign\nPartnerships.—The form should be signed with\nthe partnership name followed by the signature\nof one of the general partners and the words\n“General Partner.”\nCorporations, cooperatives, and insurance\ncompanies.—The form should show the name of\nthe corporation, cooperative, or insurance\nCompany and the signature of the president, vice\npresident, treasurer, assistant treasurer, or chief\naccounting officer (such as tax officer) authorized\ntosign, and his or her official title. Receivers,\ntrustees, or assignees must sign any application\nthey are required to file, For a subsidiary\ncorporation filing a consolidated return with its\nparent, the form should be signed by an officer of\nthe parent corporation,\nluciaries.—The-form should show the name\nof the estate or trust and be signed by the\nfiduciary, personal representative, executor,\nexecutrix, administrator, administratrx, etc’,\nhaving legal authority to'sign, and his or her ttle.\nPreparer other than partner, officer, etc.—The\nsignature of the individual preparing the\napplication should appear in the space provided\non page",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "586e989b479e4362ebe28a6954c1427b",
|
||||
"text": "If the individual or firm is also authorized to",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "226fa83297914d5195e002508d61fb1d",
|
||||
"text": "General Instructions\n\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d0e1e01dcbc7b4dfa2df8fe1d7c71acc",
|
||||
"text": "General Instructions\nPurpose of Form\n\nPile thse Seen te vepsect a phepee se\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f0e951e5bcb4a6070fa6672b37822348",
|
||||
"text": "Purpose of Form\n\nCin bce Secon te cece cget.\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "03fe77cbc1e2a87cdf64a64b839545b5",
|
||||
"text": "alata\nGenerally, applicants must complete Section\n\n‘A. In addition, complete the appropriate sections\n\n(B:1 through H) for which a change is desired.\n\nYou must give all relevant facts. including a\n\n",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "7fc74bd7792c99bb71777aeaea5bf987",
|
||||
"text": "Time and Place for Filing\namacall, ammlinapte pret file trie\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "efd2dea48b678ae3052a8fae284dcd9b",
|
||||
"text": "on page ©.\n\nIf the individual or firm is also authorized to\nrepresent the applicant before the IRS, receive\na copy of the requested ruling, or perform any\nother act(s), the power of attorney must reflect\nsuch authorization(s).\n\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8b35e7c212710b1099b675ce9394fb47",
|
||||
"text": "Se NB ON\n\nState whether you desire a conference in the\nNational Office if the Service proposes to\ndisapprove your application.\n\n",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "7c9e868b449a25434af63386e8c72962",
|
||||
"text": "Affiliated Groups\n\nTavmayare that ara mam)\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "d6d128db1d06743816667d277159e1e9",
|
||||
"text": "Changes to Accounting Methods\nRequired Under the Tax Reform Act\nof 1986\n\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4f022ad16f9de29b399fe4e77ebec3da",
|
||||
"text": "Uniform capitalization rules and limitation on\n‘cash method. —If you are required to change\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "231967b6e23633ce4b794ba4d92195b5",
|
||||
"text": "Specific Instructions\nSection A\n\nItem Sa. nage 1 «-\"Taxahle incams\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "1dda7db8eaa236f190c9f1385666af36",
|
||||
"text": "anearly application.\nNote: if this form is being filed in accordance\nwith Rev. Proc. 74-11, see Section G below.\n\na.\n\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "b4a7f10875d4301b0cbce5eff69f64df",
|
||||
"text": "Late Applications\n\nMe cms anmiimation ie Fler\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e4a97fbdd3d6f33335ec71deba7af01f",
|
||||
"text": "includes total sales (net of returns and\nallowances) and all amounts received for\nservices. in addition, gross receipts include any\nincome from investments and from incidental or\noutside sources (e.g., interest, dividends, rents,\nroyalties, and annuities). However, if you area\nresaler of personal property, exclude from gross\nreceipts any amounts not derived in the ordinary\ncourse of a trade or business. Gross receipts do\nnot include amounts received for sales taxes if,\ntunder the applicable state or local law, the taxis\nlegally imposed on the purchaser of the good or\nservice, and the taxpayer merely collects and\nremits the tax to the taxing authority.\n",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "786c2aaee9fcae020f4b01a298e4d141",
|
||||
"text": "Disregard the instructions under Time and\nPlace for Filing and Late Applications. instead,\nattach Form 3115 to your income tax return for\nthe year of change; do not file it separately. Also\ninclude on a separate statement accompanying\nthe Form 3115 the period over which the section\n481(2) adjustment will be taken into account and\nthe basis for that conclusion. Identify the\n\n",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f2db523f6d52de1e67f6e8c1c81a8069",
|
||||
"text": "Identifying Number\n",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
}
|
||||
]
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,226 @@
|
||||
[
|
||||
{
|
||||
"element_id": "41f6e17bf5e9a407fcca74e902f802a0",
|
||||
"text": "News Around NOAA",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "aa589c25dc22dcc8a75baba1244e6c8f",
|
||||
"text": "National Program",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "62c26d2e16774d2334bd804c7bb6a711",
|
||||
"text": "Are You Weather-Ready for the Spring?",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "32709cd3bec72640bbbe32f58e6e23f6",
|
||||
"text": "Weather.gov >",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "2661da76db570876b075083aaeeaee55",
|
||||
"text": "News Around NOAA > Are You Weather-Ready for the Spring?",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "fab6c4df083f0fb6f324fff65b652c86",
|
||||
"text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "45c26cf3457e6d18985a435e2c0fcc65",
|
||||
"text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "77f5acc603de9a165ed87a5c3fbaf14a",
|
||||
"text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "8f19bcaabbd1bafa5e9826ac69766c8b",
|
||||
"text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "1245f9cf9e019713391e4ee3bac54a63",
|
||||
"text": "Collaboration Get Involved Social Media WRN Ambassadors Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only) SKYWARN",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "23dfa7f98424dbf86e00b3d500096dfa",
|
||||
"text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "93202df2ec7081b28b47901b5c287a5a",
|
||||
"text": "International",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "e53d6a9c615bdf1a8d7b98a67cade488",
|
||||
"text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science",
|
||||
"type": "ListItem",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0",
|
||||
"text": "The spring season is all about change – a rebirth both literally and figuratively. Even though the spring season doesn’t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "7184168da442c6ef28553b274bf2be8f",
|
||||
"text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "f3be9748ecd68b20d706548129baa22d",
|
||||
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "126c3cd201fb259cfeabc6bffc0b5473",
|
||||
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "c1944fb037f3e1cb14969bc59a7dd9c2",
|
||||
"text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "fa1b939ef6159d95260bc095f58ebbc2",
|
||||
"text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "47d5d0d27a35a36d7467dfc8b6e089b3",
|
||||
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us.",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "129c678fce59acee7ac6a6fdb67b6310",
|
||||
"text": "Disclaimer",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "3c96caaebd949e39d25b3ccf4133c5d8",
|
||||
"text": "Information Quality",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "b79cac926e0b2e347e72cc91d5174037",
|
||||
"text": "Help",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4c4e436f9a453c776dbf011f98d932d6",
|
||||
"text": "Glossary",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "506ff394621596dd88138642eddfc1e4",
|
||||
"text": "Privacy Policy",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "c70ae8c30a61c450d2c5148d1b6a0447",
|
||||
"text": "Freedom of Information Act (FOIA)",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "5d8c71abc527284cd463aa58f3f48098",
|
||||
"text": "About Us",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "a8a00c355d2fa1461d532a1088274f32",
|
||||
"text": "Career Opportunities",
|
||||
"type": "Title",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
}
|
||||
}
|
||||
]
|
||||
@ -48,8 +48,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4bf1c98e38b4b85a7bb1bedb76383117",
|
||||
"text": "(http: ||creativecommons. org/licenses/by- nc- -nd/4, 0/). ",
|
||||
"element_id": "3ec82e9a5e3b39e7710305fd9be4e8d5",
|
||||
"text": "(http: ||creativecommons. org/licenses/by- nc- -nd/4, 0/).",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
@ -288,8 +288,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "335719aca05b1cfa324181802cd3f003",
|
||||
"text": "Eemoos—{1So *2055 . —é —\"15 — Control2 — 8250.0000001 0.00001 0.001 o1Current Density (A/cm2)",
|
||||
"element_id": "07e8e4f9666bde08d71c1617f69eddd1",
|
||||
"text": "Potential (Vv)nm°in°}ary=ES 724250.0000001T T T0.00001 0.001 olCurrent Density (A/cm2)",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 4
|
||||
@ -344,8 +344,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "1da08457018fe7f4244669a3023cfc2f",
|
||||
"text": "oe ae TRE OaEmcee Det: DOE eee ",
|
||||
"element_id": "6959a323ee23c858c3b1411b05db6ebf",
|
||||
"text": "SEM HV: Q0KY WD: 14.89 rmrm‘DEM MAO: 209 x Det: DOE Pecforsence In nenospact",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
@ -360,8 +360,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "73504a9f03b3be882a3ecd5b862079af",
|
||||
"text": "Fol ieadLSpena ",
|
||||
"element_id": "b0a40261108ea21c6136d3172b4cd987",
|
||||
"text": "gEOOfeSem ny. 200 Rv",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
@ -376,8 +376,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "c9547666563138eb19ca35a3b250a190",
|
||||
"text": "atSEM HY: 20.0KU ",
|
||||
"element_id": "bb1b80b1cdf7f88847e1c8231fb4aae7",
|
||||
"text": "aSEM HY: 20.0KV",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
@ -408,8 +408,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "73e781ff9b3df24b670a347e44da068f",
|
||||
"text": " ouH,;COCHNY OHOH",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 6
|
||||
|
||||
@ -48,8 +48,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "958d8b4aff103d20c38caf156a80c238",
|
||||
"text": " (http: ||creativecommons. org/licenses/by- nce-nd/4.0/). ",
|
||||
"element_id": "b16a14378c7e3641edaab4832d548e08",
|
||||
"text": "(http: ||creativecommons. org/licenses/by- nce-nd/4.0/).",
|
||||
"type": "NarrativeText",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
@ -112,8 +112,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 3
|
||||
@ -192,8 +192,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
@ -216,24 +216,24 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "b42bc8a6e8c708b898dc318090243df5",
|
||||
"text": ": ¥ A4 : ¢@Nyy4 4LANIK¥||SW",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "955571c35ce6527872230bf53595aa9e",
|
||||
"text": "eo re ",
|
||||
"element_id": "06a308d0660e39112e3611ca071fc163",
|
||||
"text": "i ee ee",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 5
|
||||
@ -288,8 +288,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 6
|
||||
@ -344,8 +344,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 7
|
||||
@ -432,8 +432,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 9
|
||||
@ -472,8 +472,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 10
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 1
|
||||
@ -72,8 +72,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 3
|
||||
@ -272,8 +272,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 6
|
||||
@ -376,8 +376,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "36a9e7f1c95b82ffb99743e0c5c4ce95",
|
||||
"text": " ",
|
||||
"element_id": "e3b0c44298fc1c149afbf4c8996fb924",
|
||||
"text": "",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 8
|
||||
@ -392,8 +392,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "28285134b5841ec7151f35f18434efe9",
|
||||
"text": "AY ny ",
|
||||
"element_id": "bd051434f4157e51fd1185e80bd847f8",
|
||||
"text": "AY nO",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 9
|
||||
@ -424,8 +424,8 @@
|
||||
}
|
||||
},
|
||||
{
|
||||
"element_id": "4ad4b4c50d891d618b06f85c6b398770",
|
||||
"text": " y[he ere AE,BEISSS ",
|
||||
"element_id": "533260e4d7afb457efe61c53a93718bf",
|
||||
"text": "y[hee AESUROa er",
|
||||
"type": "FigureCaption",
|
||||
"metadata": {
|
||||
"page_number": 9
|
||||
|
||||
@ -8,10 +8,30 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--remote-url abfs://container1/ \
|
||||
--azure-account-name azureunstructured1 \
|
||||
--structured-output-dir azure-ingest-output \
|
||||
--reprocess \
|
||||
--num-processes 2
|
||||
|
||||
if [ "$(find 'azure-ingest-output' -type f -printf '.' | wc -c)" -ne 5 ]; then
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
cp azure-ingest-output/* test_unstructured_ingest/expected-structured-output/azure-blob-storage/
|
||||
|
||||
elif ! diff -ru test_unstructured_ingest/expected-structured-output/azure-blob-storage azure-ingest-output ; then
|
||||
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
|
||||
echo
|
||||
echo " export OVERWRITE_FIXTURES=true"
|
||||
echo
|
||||
echo "and then rerun this script."
|
||||
echo
|
||||
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
|
||||
echo "to update fixtures for CI."
|
||||
echo
|
||||
echo "5 files should have been created."
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
@ -15,18 +15,31 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--biomed-api-until "2019-01-02+00:03:10" \
|
||||
--structured-output-dir biomed-ingest-output-api \
|
||||
--num-processes 2 \
|
||||
--reprocess \
|
||||
--verbose \
|
||||
--download-dir biomed-download-api \
|
||||
--preserve-downloads
|
||||
|
||||
if ! diff -ru biomed-ingest-output-api test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, copy the outputs from"
|
||||
echo "biomed-ingest-output-api/ to test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api/ after running"
|
||||
echo
|
||||
echo "PYTHONPATH=. ./unstructured/ingest/main.py --biomed-api-from '2019-01-02' --biomed-api-until '2019-01-02+00:03:10' --structured-output-dir biomed-ingest-output-api --num-processes 2 --verbose --download-dir biomed-download-api --preserve-downloads"
|
||||
echo
|
||||
exit 1
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
OWNER_GROUP=$(stat -c "%u:%g" test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api)
|
||||
rsync -rv --chown="$OWNER_GROUP" biomed-ingest-output-api/ test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api
|
||||
|
||||
elif ! diff -ru biomed-ingest-output-api test_unstructured_ingest/expected-structured-output/biomed-ingest-output-api ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
|
||||
echo
|
||||
echo " export OVERWRITE_FIXTURES=true"
|
||||
echo
|
||||
echo "and then rerun this script."
|
||||
echo
|
||||
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
|
||||
echo "to update fixtures for CI."
|
||||
echo
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
@ -19,14 +19,26 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--preserve-downloads
|
||||
|
||||
|
||||
if ! diff -ru biomed-ingest-output-path test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, copy the outputs from"
|
||||
echo "biomed-ingest-output-path/ to test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path/ after running"
|
||||
echo
|
||||
echo "PYTHONPATH=. ./unstructured/ingest/main.py --biomed-path 'oa_pdf/07/07/sbaa031.073.PMC7234218.pdf' --structured-output-dir biomed-ingest-output-path --num-processes 2 --verbose --download-dir biomed-download-path --preserve-downloads"
|
||||
echo
|
||||
exit 1
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
OWNER_GROUP=$(stat -c "%u:%g" test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path)
|
||||
rsync -rv --chown="$OWNER_GROUP" biomed-ingest-output-path/ test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path
|
||||
|
||||
elif ! diff -ru biomed-ingest-output-path test_unstructured_ingest/expected-structured-output/biomed-ingest-output-path ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
|
||||
echo
|
||||
echo " export OVERWRITE_FIXTURES=true"
|
||||
echo
|
||||
echo "and then rerun this script."
|
||||
echo
|
||||
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
|
||||
echo "to update fixtures for CI."
|
||||
echo
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@ -17,16 +17,30 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--github-url dcneiner/Downloadify \
|
||||
--git-file-glob '*.html,*.txt' \
|
||||
--structured-output-dir github-downloadify-output \
|
||||
--reprocess \
|
||||
--preserve-downloads \
|
||||
--verbose
|
||||
|
||||
if ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, copy the outputs from"
|
||||
echo "github-downloadify-output/ to test_unstructured_ingest/expected-structured-output/github-downloadify/ after running"
|
||||
echo
|
||||
echo " PYTHONPATH=. ./unstructured/ingest/main.py --github-url dcneiner/Downloadify --github-file-glob '*.html,*.txt' --structured-output-dir github-downloadify-output --verbose"
|
||||
echo
|
||||
exit 1
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
cp github-downloadify-output/* test_unstructured_ingest/expected-structured-output/github-downloadify/
|
||||
|
||||
elif ! diff -ru test_unstructured_ingest/expected-structured-output/github-downloadify github-downloadify-output ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
|
||||
echo
|
||||
echo " export OVERWRITE_FIXTURES=true"
|
||||
echo
|
||||
echo "and then rerun this script."
|
||||
echo
|
||||
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
|
||||
echo "to update fixtures for CI."
|
||||
echo
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
@ -14,15 +14,29 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--s3-url s3://utic-dev-tech-fixtures/small-pdf-set/ \
|
||||
--s3-anonymous \
|
||||
--structured-output-dir s3-small-batch-output \
|
||||
--preserve-downloads \
|
||||
--reprocess
|
||||
|
||||
if ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
cp s3-small-batch-output/small-pdf-set/* test_unstructured_ingest/expected-structured-output/s3-small-batch/small-pdf-set/
|
||||
|
||||
elif ! diff -ru test_unstructured_ingest/expected-structured-output/s3-small-batch s3-small-batch-output ; then
|
||||
echo
|
||||
echo "There are differences from the previously checked-in structured outputs."
|
||||
echo
|
||||
echo "If these differences are acceptable, copy the outputs from"
|
||||
echo "s3-small-batch-output/ to test_unstructured_ingest/expected-structured-output/s3-small-batch/ after running"
|
||||
echo "If these differences are acceptable, overwrite by the fixtures by setting the env var:"
|
||||
echo
|
||||
echo " PYTHONPATH=. python examples/ingest/s3-small-batch/main.py --structured-output-dir s3-small-batch-output"
|
||||
echo " export OVERWRITE_FIXTURES=true"
|
||||
echo
|
||||
echo "and then rerun this script."
|
||||
echo
|
||||
echo "NOTE: You'll likely just want to run scripts/ingest-test-fixtures-update.sh on x86_64 hardware"
|
||||
echo "to update fixtures for CI,"
|
||||
echo
|
||||
exit 1
|
||||
|
||||
fi
|
||||
|
||||
@ -5,6 +5,9 @@ set -eux -o pipefail
|
||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||
cd "$SCRIPT_DIR"/.. || exit 1
|
||||
|
||||
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
||||
export OMP_THREAD_LIMIT=1
|
||||
|
||||
./test_unstructured_ingest/test-ingest-s3.sh
|
||||
./test_unstructured_ingest/test-ingest-azure.sh
|
||||
./test_unstructured_ingest/test-ingest-github.sh
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user