mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-08-15 20:27:37 +00:00
enhancement: clean pdf elements (bump unstructured-inference) (#790)
More deterministic element ordering when using hi_res PDF parsing strategy (from unstructured-inference bump to 0.5.4) Make large model available (from unstructured-inference bump to 0.5.3) Combine inferred elements with extracted elements (from unstructured-inference bump to 0.5.2) --------- Co-authored-by: Roman Isecke <roman@unstructured.io> Co-authored-by: Crag Wolfe <crag@unstructured.io>
This commit is contained in:
parent
642562beb5
commit
350bb1dad5
@ -1,7 +1,10 @@
|
||||
## 0.7.11-dev2
|
||||
## 0.7.11
|
||||
|
||||
### Enhancements
|
||||
|
||||
* More deterministic element ordering when using `hi_res` PDF parsing strategy (from unstructured-inference bump to 0.5.4)
|
||||
* Make large model available (from unstructured-inference bump to 0.5.3)
|
||||
* Combine inferred elements with extracted elements (from unstructured-inference bump to 0.5.2)
|
||||
* `partition_email` and `partition_msg` will now process attachments if `process_attachments=True`
|
||||
and an attachment partitioning function is passed through with `attachment_partitioner=partition`.
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
#
|
||||
anyio==3.7.0
|
||||
# via httpcore
|
||||
argilla==1.10.0
|
||||
argilla==1.12.0
|
||||
# via -r requirements/base.in
|
||||
backoff==2.2.1
|
||||
# via argilla
|
||||
@ -53,7 +53,7 @@ idna==3.4
|
||||
# rfc3986
|
||||
importlib-metadata==6.7.0
|
||||
# via markdown
|
||||
joblib==1.2.0
|
||||
joblib==1.3.1
|
||||
# via nltk
|
||||
lxml==4.9.2
|
||||
# via
|
||||
@ -130,13 +130,12 @@ tqdm==4.65.0
|
||||
# via
|
||||
# argilla
|
||||
# nltk
|
||||
typer==0.9.0
|
||||
typer==0.7.0
|
||||
# via argilla
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# pydantic
|
||||
# rich
|
||||
# typer
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
# -c requirements/constraints.in
|
||||
|
@ -82,7 +82,7 @@ importlib-metadata==6.7.0
|
||||
# nbconvert
|
||||
importlib-resources==5.12.0
|
||||
# via jsonschema
|
||||
ipykernel==6.23.2
|
||||
ipykernel==6.23.3
|
||||
# via
|
||||
# ipywidgets
|
||||
# jupyter
|
||||
@ -121,7 +121,7 @@ jsonschema[format-nongpl]==4.17.3
|
||||
# nbformat
|
||||
jupyter==1.0.0
|
||||
# via -r requirements/dev.in
|
||||
jupyter-client==8.2.0
|
||||
jupyter-client==8.3.0
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-console
|
||||
@ -147,7 +147,7 @@ jupyter-core==5.3.1
|
||||
# qtconsole
|
||||
jupyter-events==0.6.3
|
||||
# via jupyter-server
|
||||
jupyter-server==2.6.0
|
||||
jupyter-server==2.7.0
|
||||
# via
|
||||
# nbclassic
|
||||
# notebook-shim
|
||||
@ -219,7 +219,7 @@ pip-tools==6.13.0
|
||||
# via -r requirements/dev.in
|
||||
pkgutil-resolve-name==1.3.10
|
||||
# via jsonschema
|
||||
platformdirs==3.6.0
|
||||
platformdirs==3.8.0
|
||||
# via
|
||||
# -c requirements/test.txt
|
||||
# jupyter-core
|
||||
@ -352,12 +352,12 @@ traitlets==5.9.0
|
||||
# nbformat
|
||||
# notebook
|
||||
# qtconsole
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/test.txt
|
||||
# ipython
|
||||
uri-template==1.2.0
|
||||
uri-template==1.3.0
|
||||
# via jsonschema
|
||||
virtualenv==20.23.1
|
||||
# via pre-commit
|
||||
@ -369,7 +369,7 @@ webencodings==0.5.1
|
||||
# via
|
||||
# bleach
|
||||
# tinycss2
|
||||
websocket-client==1.6.0
|
||||
websocket-client==1.6.1
|
||||
# via jupyter-server
|
||||
wheel==0.40.0
|
||||
# via
|
||||
|
@ -32,7 +32,7 @@ idna==3.4
|
||||
# requests
|
||||
jinja2==3.1.2
|
||||
# via torch
|
||||
joblib==1.2.0
|
||||
joblib==1.3.1
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# sacremoses
|
||||
@ -92,7 +92,7 @@ tqdm==4.65.0
|
||||
# transformers
|
||||
transformers==4.30.2
|
||||
# via -r requirements/huggingface.in
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
|
@ -93,7 +93,7 @@ six==1.16.0
|
||||
# azure-core
|
||||
# azure-identity
|
||||
# isodate
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# azure-core
|
||||
|
@ -16,7 +16,7 @@ charset-normalizer==3.1.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# aiohttp
|
||||
discord-py==2.3.0
|
||||
discord-py==2.3.1
|
||||
# via -r requirements/ingest-discord.in
|
||||
frozenlist==1.3.3
|
||||
# via
|
||||
|
@ -40,7 +40,7 @@ google-api-core==2.11.1
|
||||
# via
|
||||
# google-cloud-core
|
||||
# google-cloud-storage
|
||||
google-auth==2.20.0
|
||||
google-auth==2.21.0
|
||||
# via
|
||||
# gcsfs
|
||||
# google-api-core
|
||||
@ -51,7 +51,7 @@ google-auth-oauthlib==1.0.0
|
||||
# via gcsfs
|
||||
google-cloud-core==2.3.2
|
||||
# via google-cloud-storage
|
||||
google-cloud-storage==2.9.0
|
||||
google-cloud-storage==2.10.0
|
||||
# via gcsfs
|
||||
google-crc32c==1.5.0
|
||||
# via google-resumable-media
|
||||
|
@ -17,9 +17,9 @@ charset-normalizer==3.1.0
|
||||
# requests
|
||||
google-api-core==2.11.1
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.90.0
|
||||
google-api-python-client==2.91.0
|
||||
# via -r requirements/ingest-google-drive.in
|
||||
google-auth==2.20.0
|
||||
google-auth==2.21.0
|
||||
# via
|
||||
# google-api-core
|
||||
# google-api-python-client
|
||||
|
@ -33,5 +33,5 @@ urllib3==1.26.16
|
||||
# -c requirements/base.txt
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
websocket-client==1.6.0
|
||||
websocket-client==1.6.1
|
||||
# via praw
|
||||
|
@ -4,7 +4,7 @@
|
||||
#
|
||||
# pip-compile requirements/ingest-s3.in
|
||||
#
|
||||
aiobotocore==2.5.0
|
||||
aiobotocore==2.5.1
|
||||
# via s3fs
|
||||
aiohttp==3.8.4
|
||||
# via
|
||||
@ -18,7 +18,7 @@ async-timeout==4.0.2
|
||||
# via aiohttp
|
||||
attrs==23.1.0
|
||||
# via aiohttp
|
||||
botocore==1.29.76
|
||||
botocore==1.29.161
|
||||
# via aiobotocore
|
||||
charset-normalizer==3.1.0
|
||||
# via
|
||||
@ -52,7 +52,7 @@ six==1.16.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# python-dateutil
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# aioitertools
|
||||
|
@ -1,3 +1,3 @@
|
||||
-c constraints.in
|
||||
-c base.txt
|
||||
unstructured-inference==0.5.1
|
||||
unstructured-inference==0.5.4
|
||||
|
@ -205,13 +205,13 @@ tqdm==4.65.0
|
||||
# transformers
|
||||
transformers==4.30.2
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# huggingface-hub
|
||||
# iopath
|
||||
# torch
|
||||
unstructured-inference==0.5.1
|
||||
unstructured-inference==0.5.4
|
||||
# via -r requirements/local-inference.in
|
||||
urllib3==1.26.16
|
||||
# via
|
||||
|
@ -34,7 +34,7 @@ flake8==6.0.0
|
||||
# via -r requirements/test.in
|
||||
freezegun==1.2.2
|
||||
# via -r requirements/test.in
|
||||
grpcio==1.54.2
|
||||
grpcio==1.56.0
|
||||
# via -r requirements/test.in
|
||||
idna==3.4
|
||||
# via
|
||||
@ -43,7 +43,7 @@ idna==3.4
|
||||
# yarl
|
||||
iniconfig==2.0.0
|
||||
# via pytest
|
||||
label-studio-sdk==0.0.28
|
||||
label-studio-sdk==0.0.29
|
||||
# via -r requirements/test.in
|
||||
label-studio-tools==0.0.2
|
||||
# via label-studio-sdk
|
||||
@ -56,7 +56,7 @@ mccabe==0.7.0
|
||||
# via flake8
|
||||
multidict==6.0.4
|
||||
# via yarl
|
||||
mypy==1.4.0
|
||||
mypy==1.4.1
|
||||
# via -r requirements/test.in
|
||||
mypy-extensions==1.0.0
|
||||
# via
|
||||
@ -69,9 +69,9 @@ packaging==23.1
|
||||
# pytest
|
||||
pathspec==0.11.1
|
||||
# via black
|
||||
platformdirs==3.6.0
|
||||
platformdirs==3.8.0
|
||||
# via black
|
||||
pluggy==1.0.0
|
||||
pluggy==1.2.0
|
||||
# via pytest
|
||||
pycodestyle==2.10.0
|
||||
# via flake8
|
||||
@ -81,7 +81,7 @@ pydantic==1.10.9
|
||||
# label-studio-sdk
|
||||
pyflakes==3.0.1
|
||||
# via flake8
|
||||
pytest==7.3.2
|
||||
pytest==7.4.0
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
@ -99,7 +99,7 @@ requests==2.31.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# label-studio-sdk
|
||||
ruff==0.0.273
|
||||
ruff==0.0.275
|
||||
# via -r requirements/test.in
|
||||
six==1.16.0
|
||||
# via
|
||||
@ -120,7 +120,7 @@ types-tabulate==0.9.0.2
|
||||
# via -r requirements/test.in
|
||||
types-urllib3==1.26.25.13
|
||||
# via types-requests
|
||||
typing-extensions==4.6.3
|
||||
typing-extensions==4.7.0
|
||||
# via
|
||||
# -c requirements/base.txt
|
||||
# black
|
||||
@ -132,7 +132,7 @@ urllib3==1.26.16
|
||||
# -c requirements/constraints.in
|
||||
# requests
|
||||
# vcrpy
|
||||
vcrpy==4.3.1
|
||||
vcrpy==5.0.0
|
||||
# via -r requirements/test.in
|
||||
wrapt==1.14.1
|
||||
# via
|
||||
|
@ -15,7 +15,6 @@ from unstructured.documents.elements import (
|
||||
ElementMetadata,
|
||||
ListItem,
|
||||
NarrativeText,
|
||||
PageBreak,
|
||||
Table,
|
||||
Text,
|
||||
Title,
|
||||
@ -438,7 +437,7 @@ def test_auto_partition_ppt_from_filename():
|
||||
def test_auto_with_page_breaks():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
|
||||
elements = partition(filename=filename, include_page_breaks=True, strategy="hi_res")
|
||||
assert any(isinstance(element, PageBreak) for element in elements)
|
||||
assert "PageBreak" in [elem.category for elem in elements]
|
||||
|
||||
|
||||
def test_auto_partition_epub_from_filename():
|
||||
|
@ -1,382 +1,132 @@
|
||||
[
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "833e3f9d4af02845c670c31e2d6d4f9a",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "a81f2feee790a4c2cf749889073d947a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Skills for Biomedical Data"
|
||||
"text": "Lisa Federer, MLIS, Data Science Training Coordinator"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "64b2134f054446d473fce1b05d4d4c94",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "c8e51fdc53c202393adad77f7f93ee5a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Maryam Zaringhalam, PhD, AAAS Science & Technology Policy Fellow"
|
||||
"text": "Executive Summary"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f3416e4bccede2117fed6bc61910bc18",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "04ff84b51fab69c07381ac794b740243",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "F. Huerta, PhD, Associate Director of NLM for Program Development and NLM of Data Science and Open Science Initiatives"
|
||||
"text": "1. General biomedical subject matter knowledge: biomedical data scientists should have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "03382d8edd187c79918f58dabfe3efa9",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "a88d172116b68102146559d58fdb6669",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "This report provides recommendations for a minimal set of core skills for biomedical data scientists based on analysis that draws on opinions of data scientists, curricula for existing biomedical data science programs, and requirements for biomedical data science jobs."
|
||||
"text": "2. Programming language expertise: biomedical data scientists should be fluent in at"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "5f86ba4abc2e566faf03d08d68497fe3",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "f5f4c576951865bd016d4be673f624ff",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "have a general working knowledge of the principles of biology, bioinformatics, and basic clinical science;"
|
||||
"text": "least one programming language (typically R and/or Python);"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "a7933384fcbc1b05de9f42caa2a53259",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8b02f539eb8ccee5b3fc24f66858188c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Programming language expertise: biomedical data scientists should be fluent in at least one programming language (typically R and/or Python);"
|
||||
"text": "3. Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning emerged as especially important skills in biomedical data science;"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "574d13919f4ffba04df0cff8e3a96665",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "469e981f34d1e6f2b420574ed8e932d2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Predictive analytics, modeling, and machine learning: while a range of statistical methods may be useful, predictive analytics, modeling, and machine learning as especially important skills in biomedical data science;"
|
||||
"text": "4. Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be as important as the more technical skills typically associated with data science."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "2c2eabce11151dfe0837d521ad2bcc56",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "4b8fc76cbba0e2fef79ff8bc668b1401",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Team science and scientific communication: “soft” skills, like the ability to work well on teams and communicate effectively in both verbal and written venues, may be important as the more technical skills typically associated with data science."
|
||||
"text": "5. Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "186d6df0a7df1ef56ffd0aca24c8cb95",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "69da7754428f154ee3b2906214d31ad9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Responsible data stewardship: a successful data scientist must be able to implement best practices for data management and stewardship, as well as conduct research in an ethical manner that maintains data security and privacy."
|
||||
"text": "The report further details specific skills and expertise relevant to biomedical data scientists."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "5e3d4670749a0f3753fa4bb1b328d156",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "37486ef32cbf05082d5dbff0581db762",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Training a biomedical data science (BDS) workforce is a central theme in NLM’s Strategic Plan for the coming decade. That commitment is echoed in the NIH-wide Big Data to Knowledge (BD2K) initiative, which invested $61 million between FY2014 and FY2017 in training programs for the development and use of biomedical big data science methods and tools. In line with"
|
||||
"text": "Motivation"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "c6a6ea3046a1368cce3761309c6fc20e",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "9fc51802fc970310e99a77b9f29af9ab",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "commitment, a recent report to the NLM Director recommended working across NIH to and develop core skills required of a biomedical data scientist to consistency across cohort of NIH-trained data scientists. This report provides a set of recommended core skills on analysis of current BD2K-funded training programs, biomedical data science job ads, practicing members of the current data science workforce."
|
||||
"text": "Core Skills for Biomedical Data Scientists _____________________________________________________________________________________________"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "f26d07e6b71e42596791a241e2417931",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "2e3cec7bff1e8c8d8e0087f0bcfa89f0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "Methodology"
|
||||
"text": "a) Responses to a 2017 Kaggle1 survey2 of over 16,000 self-identified data scientists working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "bcefa2402c4d32dbf76a40451d0fc3dd",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8c90963d8f3da9bf0465279550af369a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "The Workforce Excellence team took a three-pronged approach to identifying core skills required of a biomedical data scientist (BDS), drawing from:"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "f600a6418d465a7426c2277e80ad7201",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "working across many industries. Analysis of the Kaggle survey responses from the current data science workforce provided insights into the current generation of data scientists, including how they were trained and what programming and analysis skills they use. b) Data science skills taught in BD"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "24349c8054862cb8cbd4d857d096943e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "BD"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "caa5fc58a6d57578858155571d5d4f79",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") statistics and math skills; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "4b021f7187c84f22e863e931047e2fc2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") computer science; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") subject knowledge; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "ae19ecd18e97da5a942738ed9c37b235",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A. c) Desired skills identified from data science-related job ads."
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "d14cf7578b76bba89cd14f7c65d27dce",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "job ads from government ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "ff8f02c33b45fd488b21342ad816f985",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "%), academia ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "7b953c4510d51c8c49bdb1f72208e813",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "%), industry ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "b991274d798760827347db84d4c50aed",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "%), and the nonprofit sector ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "414960aea6ab87382923424b3cc49a05",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "73b8242ab49aacecd5561fc18ea23239",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "b) Data science skills taught in BD"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "07f54d1cb2e96bc062c55121de3f6882",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "24349c8054862cb8cbd4d857d096943e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "BD"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "caa5fc58a6d57578858155571d5d4f79",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "8b67c1eff9f0e59b2d8a11195bc13ce1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") statistics and math skills; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "4b021f7187c84f22e863e931047e2fc2",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") computer science; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "59e087d0c9fcb1a8cc6d5448ce5fad04",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") subject knowledge; ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
"element_id": "c865029d7025ef68891ec5c426b9aaa3",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": ") general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "844fd770568e8ee833454bfcc3a3340c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "Data science skills taught in BD2K-funded training programs. A qualitative content analysis was applied to the descriptions of required courses offered under the 12 BD2K-funded training programs. Each course was coded using qualitative data analysis software, with each skill that was present in the description counted once. The coding schema of data science-related skills was inductively developed and was organized into four major categories: (1) statistics and math skills; (2) computer science; (3) subject knowledge; (4) general skills, like communication and teamwork. The coding schema is detailed in Appendix A."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "52eeeeac5a03bf69edb6126abb21f1d5",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "Desired skills identified from data science-related job ads. 59 job ads from government (8.5%), academia (42.4%), industry (33.9%), and the nonprofit sector (15.3%) were sampled from websites like Glassdoor, Linkedin, and Ziprecruiter. The content analysis methodology and coding schema utilized in analyzing the training programs were applied to the job descriptions. Because many job ads mentioned the same skill more than once, each occurrence of the skill was coded, therefore weighting important skills that were mentioned multiple times in a single ad."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "6a436d034b4636ebebdfee2765d3ac9e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "Analysis of the above data provided insights into the current state of biomedical data training, as well as a view into data science-related skills likely to be needed to prepare BDS workforce to succeed in the future. Together, these analyses informed for core skills necessary for a competitive biomedical data scientist."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "94ba2c5be803a3cb405fc51dada2532d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 2
|
||||
},
|
||||
"text": "2 In August 2017, Kaggle conducted an industry-wide survey to gain a clearer picture of the state of data science and machine learning. A standard set of questions were asked of all respondents, with more specific questions related to work for employed data scientists and questions related to learning for data scientists in training. Methodology and results: https://www.kaggle.com/kaggle/kaggle-survey-2017"
|
||||
"text": "2"
|
||||
}
|
||||
]
|
File diff suppressed because one or more lines are too long
@ -7,7 +7,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Instructions for Form 3115\n(Rev. November 1987)\n\nAnnlicatinn far Chance in Accounting Mathond\n"
|
||||
"text": "Instructions for Form 3115 (Rev. November 1987) Annlicatinn far Chance in Accounting Mathond"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -17,7 +17,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "(Section references are to the Internal Revenue Code unless otherwise noted.)\n"
|
||||
"text": "(Section references are to the Internal Revenue Code unless otherwise noted.)"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -27,7 +27,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Long-term contracts.—If you are required to\nchange your method of accounting for long-term\ncontracts under section"
|
||||
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -117,7 +117,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ", for the notification\nprocedures that must be followed.\n\nOther methods. —Unless the Service has\npublished a regulation or procedure to the\ncontrary, all other changes in accounting\nmethods required by the Act are automatically\nconsidered to be approved by the Commissioner.\nExamples of method changes automatically\napproved by the Commissioner are those changes\nrequired to effect: ("
|
||||
"text": ", for the notification procedures that must be followed. Other methods. —Unless the Service has published a regulation or procedure to the contrary, all other changes in accounting methods required by the Act are automatically considered to be approved by the Commissioner. Examples of method changes automatically approved by the Commissioner are those changes required to effect: ("
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -127,7 +127,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ") the repeal of the reserve\nmethod for bad debts of taxpayers other than\nfinancial institutions (Act section"
|
||||
"text": ") the repeal of the reserve method for bad debts of taxpayers other than financial institutions (Act section"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -147,7 +147,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ") the\nrepeal of the installment method for sales under\na revolving credit plan (Act section"
|
||||
"text": ") the repeal of the installment method for sales under a revolving credit plan (Act section"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -167,7 +167,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ") the\nInclusion of mcome attributable to the sale or\nfurnishing of utility services no later than the year\nin which the services were provided to customers\n(Act section"
|
||||
"text": ") the Inclusion of mcome attributable to the sale or furnishing of utility services no later than the year in which the services were provided to customers (Act section"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -187,7 +187,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": ") the repeal of the\ndeduction for qualified discount coupons (Act\nsection"
|
||||
"text": ") the repeal of the deduction for qualified discount coupons (Act section"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -207,7 +207,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "for these\nchanges.\n\nTb od Db bee Cl"
|
||||
"text": "for these changes. Tb od Db bee Cl"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -217,7 +217,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Long-term contracts.—If you are required to\nchange your method of accounting for long-term\ncontracts under section 460, see Notice 87-61\n(9/21/87), 1987-38 IRB 40, for the notification\nprocedures that must be followed\n"
|
||||
"text": "Long-term contracts.—If you are required to change your method of accounting for long-term contracts under section 460, see Notice 87-61 (9/21/87), 1987-38 IRB 40, for the notification procedures that must be followed"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -227,7 +227,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Paperwork Reduction Act Notice\n\nWe ack for thic infarenatinn te marry mye the.\n"
|
||||
"text": "Paperwork Reduction Act Notice We ack for thic infarenatinn te marry mye the."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -237,7 +237,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Signature\ntea\n\n"
|
||||
"text": "Signature tea"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -247,7 +247,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Signature\n\nIndividuals.—An individual desiring the change\nshould sign the application. Ifthe application\npertains to a husband and wife filing a joint\nIncome tax return, the names of both should\nappear in the heading and both should sign\nPartnerships.—The form should be signed with\nthe partnership name followed by the signature\nof one of the general partners and the words\n“General Partner.”\nCorporations, cooperatives, and insurance\ncompanies.—The form should show the name of\nthe corporation, cooperative, or insurance\nCompany and the signature of the president, vice\npresident, treasurer, assistant treasurer, or chief\naccounting officer (such as tax officer) authorized\ntosign, and his or her official title. Receivers,\ntrustees, or assignees must sign any application\nthey are required to file, For a subsidiary\ncorporation filing a consolidated return with its\nparent, the form should be signed by an officer of\nthe parent corporation,\nFiduciaries.—The-form should show the name\nof the estate or trust and be signed by the\nfiduciary, personal representative, executor,\nexecutrix, administrator, administratrx, etc’,\nhaving legal authority to'sign, and his or her ttle.\nPreparer other than partner, officer, etc.—The\nsignature of the individual preparing the\napplication should appear in the space provided\non page"
|
||||
"text": "Signature Individuals.—An individual desiring the change should sign the application. Ifthe application pertains to a husband and wife filing a joint Income tax return, the names of both should appear in the heading and both should sign Partnerships.—The form should be signed with the partnership name followed by the signature of one of the general partners and the words “General Partner.” Corporations, cooperatives, and insurance companies.—The form should show the name of the corporation, cooperative, or insurance Company and the signature of the president, vice president, treasurer, assistant treasurer, or chief accounting officer (such as tax officer) authorized tosign, and his or her official title. Receivers, trustees, or assignees must sign any application they are required to file, For a subsidiary corporation filing a consolidated return with its parent, the form should be signed by an officer of the parent corporation, Fiduciaries.—The-form should show the name of the estate or trust and be signed by the fiduciary, personal representative, executor, executrix, administrator, administratrx, etc’, having legal authority to'sign, and his or her ttle. Preparer other than partner, officer, etc.—The signature of the individual preparing the application should appear in the space provided on page"
|
||||
},
|
||||
{
|
||||
"type": "ListItem",
|
||||
@ -267,7 +267,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "We ask for this information to carry out the\nInternal Revenue laws of the United States. We\nneed it to ensure that taxpayers are complying\nwith these laws an¢ to allow us to figure and\n\ncollect the right amount of tax. You are required\nto give us this information,\n"
|
||||
"text": "We ask for this information to carry out the Internal Revenue laws of the United States. We need it to ensure that taxpayers are complying with these laws an¢ to allow us to figure and collect the right amount of tax. You are required to give us this information,"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -277,7 +277,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "General Instructions\n\n"
|
||||
"text": "General Instructions"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -287,7 +287,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Purpose of Form\n\nCin bce Secon te cece cget.\n"
|
||||
"text": "Purpose of Form Cin bce Secon te cece cget."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -297,7 +297,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "ee\n\nFile this form to request a change in your\naccounting method, including the accounting\ntreatment of any item. if you are requesting 2\nchange in accounting period, use Form 1128,\nApplication for Change in Accounting Period. For\nmore information, see Publication 538,\nAccounting Periods and Methods,\n"
|
||||
"text": "ee File this form to request a change in your accounting method, including the accounting treatment of any item. if you are requesting 2 change in accounting period, use Form 1128, Application for Change in Accounting Period. For more information, see Publication 538, Accounting Periods and Methods,"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -307,7 +307,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Seti aes\n\nWhen filing Form 3115, taxpayers are\nreminded to determine if IRS has published a\nruling or procedure dealing with the specific type\nof change since November 1987 (the current.\nrevision date of Form 3115)\n\n"
|
||||
"text": "Seti aes When filing Form 3115, taxpayers are reminded to determine if IRS has published a ruling or procedure dealing with the specific type of change since November 1987 (the current. revision date of Form 3115)"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -317,7 +317,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "POMS SANE OPFOR DA 29).\nGenerally, applicants must complete Section\n\n‘A. In addition, complete the appropriate sections\n\n(B:1 through H) for which a change is desired.\n\nYou must give alll relevant facts, including a\n\n"
|
||||
"text": "POMS SANE OPFOR DA 29). Generally, applicants must complete Section ‘A. In addition, complete the appropriate sections (B:1 through H) for which a change is desired. You must give alll relevant facts, including a"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -327,7 +327,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Time and Place for Filing\namarall, ammlimeete maet file snete\n"
|
||||
"text": "Time and Place for Filing amarall, ammlimeete maet file snete"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -337,7 +337,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Se NB ON\n\nState whether you desire a conference in the\nNational Office if the Service proposes to\ndisapprove your application.\n\n"
|
||||
"text": "Se NB ON State whether you desire a conference in the National Office if the Service proposes to disapprove your application."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -347,7 +347,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Affiliated Groups\n\nTavmayare that ara mam)\n\n"
|
||||
"text": "Affiliated Groups Tavmayare that ara mam)"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -357,7 +357,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Changes to Accounting Methods\nRequired Under the Tax Reform Act\nof 1986\n"
|
||||
"text": "Changes to Accounting Methods Required Under the Tax Reform Act of 1986"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -367,7 +367,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Uniform capitalization rules and limitation on\ncash method.—If you are required to char\n\n"
|
||||
"text": "Uniform capitalization rules and limitation on cash method.—If you are required to char"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -377,7 +377,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Specific Instructions\nSection A\n\nNeem Ea mama 1 !Taeahle inemes\n"
|
||||
"text": "Specific Instructions Section A Neem Ea mama 1 !Taeahle inemes"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -387,7 +387,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Late Applications\n\nMe coup armlimatinm te ler\n"
|
||||
"text": "Late Applications Me coup armlimatinm te ler"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -397,7 +397,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "lethal elaine bela\n\nDisregard the instructions under Time and\nPlace for Filing and Late Applications. instead,\nattach Form 3115 to your income tax return for\nthe year of change; do not file it separately. Also\ninclude on a separate statement accompanying\nthe Form 3115 the period over which the section\n481(2) adjustment will be taken into account and\nthe basis for that conclusion. Identify the\n"
|
||||
"text": "lethal elaine bela Disregard the instructions under Time and Place for Filing and Late Applications. instead, attach Form 3115 to your income tax return for the year of change; do not file it separately. Also include on a separate statement accompanying the Form 3115 the period over which the section 481(2) adjustment will be taken into account and the basis for that conclusion. Identify the"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
@ -407,7 +407,7 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Identifying Number\n\nNdiuidesale Am omptisoehesal\n"
|
||||
"text": "Identifying Number Ndiuidesale Am omptisoehesal"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
@ -417,6 +417,6 @@
|
||||
"filetype": "image/png",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "—e—e—— eee\nOthers.-—The employer identification number of\nan applicant other than an individual should be\nentered in this block,\n"
|
||||
"text": "—e—e—— eee Others.-—The employer identification number of an applicant other than an individual should be entered in this block,"
|
||||
}
|
||||
]
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,7 +1,27 @@
|
||||
[
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "cda1ae2f061dbdafb3374e6411d3823e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "S32"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "d7106f2241a37dc4e61314f45da1ff5b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Poster Session I"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "e77987c7b17439bcfe8150c849de15a9",
|
||||
"element_id": "7ffd3b09cb23fc26ab2411d70e53838a",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -11,27 +31,17 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "212ffda01df6b088ef8492dc27e5e461",
|
||||
"element_id": "d16d8a1280ba2acf52f98e9d3c9c2301",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 PM dosing group: 20 mg/day=-0.4%, ns; 40"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "386d01c3a035c53a261960e7553c898e",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. The timing of lurasidone administration could be considered in effort to minimize potential adverse events."
|
||||
"text": "ns; 40 mg/day=3.6%, p<0.05; 80 mg/day=4.9%, p<0.01; 120 mg/day=9.3%, p<0.001, PM dosing group: 20 mg/day=-0.4%, ns; 40 mg/day=2.8%, p<0.05: 80 mg/day=0.2%, ns; 160 mg/day=5.8%, p<0.05). There was no clear dose-dependent trend associated with nausea and RD was similar between AM and PM dosing group (AM dosing group: 20 mg/ day=0.2% ns; 40 mg/day=3.8%, p<0.05; 80 mg/day=3.8%, ns; 120 mg/ day=6.6%, ns, PM dosing group: 20 mg/day=-1.6%, ns; 40 mg/day=-1.7%, ns; 80 mg/day=5.5%, p<0.01; 160 mg/day=2.8%, ns). Discussion: The risk of adverse events in the treatment of schizophrenia with lurasidone can vary depending on the timing of administration. In particular, for akathisia and somnolence, the incidence risks were reduced when lurasidone was administered in PM. Unlike with AM administration, the dose-dependence in the risks of these adverse events were not observed in lurasidone PM administration. The timing of lurasidone administration could be considered in effort to minimize potential adverse events."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "15ef5407945d4d6b7863b5afaeb5ccb7",
|
||||
"element_id": "c02ccab64d2a356a96f5394a2b92fa0b",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -41,17 +51,17 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "f840a64edde30227ef811444e3d98073",
|
||||
"element_id": "d981d6dfaa8794c0bb733db0965b2831",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1"
|
||||
"text": "Amedeo Minichino*1, Beata Godlewska1, Philip Cowen1, Philip Burnet1, Belinda Lennox1 1University of Oxford"
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "87729f38cab913c4c23019736f8609f0",
|
||||
"element_id": "0302f9e0f412cb4c63f13818e571c25c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -61,97 +71,37 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "856ef376d66a30635256534344601ff3",
|
||||
"element_id": "6164e852cb79f9408e833e350240ac5c",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their"
|
||||
"text": "Background: Meta-analytic evidence showed increased levels of periph- eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. 
Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "fe6d77c57ae1de9b87f86c745241351a",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "18b532c1eceea81650bc4925582c44c1",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "eral endocannabinoid metabolites in psychotic illness. Alterations in the endocannabinoid system are believed to compromise glutamate and do- pamine transmission, which play a central role in pathophysiological models of psychosis. I will present preliminary data from an ongoing high-field proton magnetic resonance spectroscopy (MRS) study aimed at investigating the association between peripheral levels of endocannabinoid system metabolites and central glutamate metabolism in individuals at their first non-affective psychotic episode (NA-FEP) and healthy controls. Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP Discussion: This study will help clarifying the contribution of peripheral endocannabinoid system to central brain mechanisms of key relevance for psychotic illness. It will also add further evidence on the limited literature on high-resolution characterisation of brain metabolites in early psychosis. 
Strengths of the study include: (i) use of high-field MRS, which allows the estimation of glutamate-related compounds at higher precision than at lower field strength; (ii) reduced heterogeneity of the clinical sample (only male and NA-FEP). Limitations: small sample size and cross-sectional design."
|
||||
"text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3, Armando D’Agostino*3 1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "d258de4d2237cd8eec630531366f28c7",
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "8aa10e9d14227aadb36fe13b1086b431",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Anna Castelnovo1, Cecilia Casetta2, Francesco Donati3, Renata del Giudice3, Caroline Zangani3, Simone Sarasso3,"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "7bd18c510f2f36052d646353186ec9b9",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "1Faculty of Biomedical Sciences, Università della Svizzera Italiana, Switzerland; 2Institute of Psychiatry, Psychology and Neuroscience, King’s College London, England; 3Università degli Studi di Milano, Italy"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "8f3a9a89266b324f14a3581b118075ee",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Methods: We expect to recruit 17 NA-FEP and 20 healthy controls by January 2020. Currently, we recruited 12 NA-FEP and 18 healthy controls from two different research facilities (Imperial College London and University of Oxford) as part of a cross-sectional study. Participants un- derwent MRS scanning at 7-T with voxels placed in right dorsolateral prefrontal cortex (right-DLPFC), anterior cingulate cortex (ACC), and oc- cipital cortex. Neuro-metabolites will be calculated using the unsuppressed water signal as reference. Endocannabinoid metabolites were quantified from serum samples, collected during the same imaging session. Results: Analyses are ongoing. Based on previous evidence, expected"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "432b394cbfb1450398063f20cb7a7cdd",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "c7ef645a3ce73f2ef0479d20f26a47e0",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Results: Analyses are ongoing. Based on previous evidence, expected findings are: (i) reduced glutamate levels in the ACC and right-DLPFC of NA-FEP compared to controls; (ii) increased peripheral endocannabinoid metabolites in NA-FEP compared to controls; and (iii) inverse association between peripheral endocannabinoid metabolites and glutamate levels in ACC and right-DLPFC in NA-FEP"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "80c41b4abd6c3d2ad145cc4d726be8ec",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t"
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "cd6ce42dd231c70217aa183b1df8fc63",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
|
||||
"text": "Background: Slow waves, the hallmark of the deep nonrapid eye move- ment sleep electroencephalogram (EEG), are critical for restorative sleep and brain plasticity. They arise from the synchronous depolarization and hyperpolarization of millions of cortical neurons and their proper gen- eration and propagation relies upon the integrity of widespread cortico- thalamic networks. Slow wave abnormalities have been reported in patient with Schizophrenia, although with partially contradictory results, probably related to antipsychotic and sedative medications. Recently, their presence and delineation, have been convincingly shown in first-episode psychosis patients (FEP). However, clear evidence of this biomarker at the onset of the disease, prior to any psychopharmacological intervention, remains limited. Moreover, no attempt has been made to elucidate the prognostic meaning of this finding. Methods: We collected whole night sleep high–density electroencephalog- raphy recordings (64-channel BrainAmp, Brain Products GmbH, Gilching, Germany) in 20 drug-naive FEP patients and 20 healthy control subjects (HC). Several clinical psychometric scales as well as neurocognitive tests were administered to all subjects in order to better define psychopatholog- ical status and vulnerability. EEG slow wave activity (SWA, spectral power between 1 and 4 Hz) and several slow wave parameters were computed at each electrode location, including density and amplitude, at each electrode location. Along with a group analysis between FEP and HC, a subgroup analysis was also computed between patients who showed a progression of symptoms to full-blown Schizophrenia (SCZ, n = 10) over the next 12-month follow-up and those who did not (OTH, n = 10). Results: Sleep macro-architecture was globally preserved in FEP patients. SWA (1–4 Hz) was lower in FEP compared to HC but this difference didn’t reach statistical significance. 
Slow wave density was decreased in FEP compared to HC, with a significance that survived multiple comparison correction over a large fronto-central cluster. Mean amplitude was pre- served. At the subgroup analysis, these results were largely driven by the subgroup of patients with a confirmed diagnosis of SCZ at a 12-month fol- low-up. Indeed, no difference could be found between OTH and HC, while a strong significance was still evident between SCZ and HC."
|
||||
},
|
||||
{
|
||||
"type": "Title",
|
||||
"element_id": "5807689818fb4d4d7e7112bae842b50c",
|
||||
"element_id": "80abb04ec613b1d325ce6b8d0bb3349d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
@ -161,12 +111,32 @@
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
"element_id": "59aa170fed2f7a1ab36e75f0cd0461c4",
|
||||
"element_id": "3f834ac0bf8b0dbd8d64ee065820467f",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "Camila Loureiro*1, Corsi-Zuelli Fabiana1, Fachim Helene Aparecida1, Shuhama Rosana1, Menezes Paulo Rossi1, Dalton Caroline F2,"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "117f7774fd093a60d964cc5b461f3e22",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "AQ3"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "44b59a545030365cd1ad225ed05ff22d",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "application/pdf",
|
||||
"page_number": 1
|
||||
},
|
||||
"text": "SIRS 2020 Abstracts"
|
||||
}
|
||||
]
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1 +1 @@
|
||||
__version__ = "0.7.11-dev2" # pragma: no cover
|
||||
__version__ = "0.7.11" # pragma: no cover
|
||||
|
@ -443,23 +443,25 @@ def _check_eml_from_buffer(file: IO) -> bool:
|
||||
def document_to_element_list(
|
||||
document: "DocumentLayout",
|
||||
include_page_breaks: bool = False,
|
||||
sort: bool = False,
|
||||
) -> List[Element]:
|
||||
"""Converts a DocumentLayout object to a list of unstructured elements."""
|
||||
elements: List[Element] = []
|
||||
num_pages = len(document.pages)
|
||||
for i, page in enumerate(document.pages):
|
||||
page_elements: List[Element] = []
|
||||
for layout_element in page.elements:
|
||||
element = normalize_layout_element(layout_element)
|
||||
if isinstance(element, List):
|
||||
for el in element:
|
||||
el.metadata.page_number = i + 1
|
||||
elements.extend(element)
|
||||
page_elements.extend(element)
|
||||
continue
|
||||
else:
|
||||
element.metadata.text_as_html = (
|
||||
layout_element.text_as_html if hasattr(layout_element, "text_as_html") else None
|
||||
)
|
||||
elements.append(element)
|
||||
page_elements.append(element)
|
||||
if hasattr(page, "image"):
|
||||
image_format = page.image.format
|
||||
coordinate_system = PixelSpace(width=page.image.width, height=page.image.height)
|
||||
@ -468,8 +470,18 @@ def document_to_element_list(
|
||||
coordinate_system = None
|
||||
element._coordinate_system = coordinate_system
|
||||
_add_element_metadata(element, page_number=i + 1, filetype=image_format)
|
||||
if sort:
|
||||
page_elements = sorted(
|
||||
page_elements,
|
||||
key=lambda el: (
|
||||
el.coordinates[0][1] if el.coordinates else float("inf"),
|
||||
el.coordinates[0][0] if el.coordinates else float("inf"),
|
||||
el.id,
|
||||
),
|
||||
)
|
||||
if include_page_breaks and i < num_pages - 1:
|
||||
elements.append(PageBreak(text=""))
|
||||
page_elements.append(PageBreak(text=""))
|
||||
elements.extend(page_elements)
|
||||
|
||||
return elements
|
||||
|
||||
|
@ -5,17 +5,19 @@ from tempfile import SpooledTemporaryFile
|
||||
from typing import BinaryIO, List, Optional, Union, cast
|
||||
|
||||
import pdf2image
|
||||
import PIL
|
||||
from pdfminer.high_level import extract_pages
|
||||
from pdfminer.layout import LTContainer, LTImage, LTItem, LTTextBox
|
||||
from pdfminer.utils import open_filename
|
||||
from PIL import Image
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
from unstructured.documents.coordinates import PixelSpace
|
||||
from unstructured.documents.elements import (
|
||||
Element,
|
||||
ElementMetadata,
|
||||
Image,
|
||||
PageBreak,
|
||||
Text,
|
||||
process_metadata,
|
||||
)
|
||||
from unstructured.file_utils.filetype import (
|
||||
@ -32,6 +34,8 @@ from unstructured.partition.strategies import determine_pdf_or_image_strategy
|
||||
from unstructured.partition.text import element_from_text, partition_text
|
||||
from unstructured.utils import requires_dependencies
|
||||
|
||||
RE_MULTISPACE_INCLUDING_NEWLINES = re.compile(pattern=r"\s+", flags=re.DOTALL)
|
||||
|
||||
|
||||
@process_metadata()
|
||||
@add_metadata_with_filetype(FileType.PDF)
|
||||
@ -192,8 +196,23 @@ def _partition_pdf_or_image_local(
|
||||
extract_tables=infer_table_structure,
|
||||
model_name=model_name,
|
||||
)
|
||||
elements = document_to_element_list(layout, include_page_breaks=include_page_breaks, sort=False)
|
||||
out_elements = []
|
||||
|
||||
return document_to_element_list(layout, include_page_breaks=include_page_breaks)
|
||||
for el in elements:
|
||||
if (isinstance(el, PageBreak) and not include_page_breaks) or (
|
||||
# NOTE(crag): small chunks of text from Image elements tend to be garbage
|
||||
isinstance(el, Image)
|
||||
and (el.text is None or len(el.text) < 24 or el.text.find(" ") == -1)
|
||||
):
|
||||
continue
|
||||
# NOTE(crag): this is probably always a Text object, but check for the sake of typing
|
||||
if isinstance(el, Text):
|
||||
el.text = re.sub(RE_MULTISPACE_INCLUDING_NEWLINES, " ", el.text or "").strip()
|
||||
if el.text or isinstance(el, PageBreak):
|
||||
out_elements.append(cast(Element, el))
|
||||
|
||||
return out_elements
|
||||
|
||||
|
||||
@requires_dependencies("pdfminer", "local-inference")
|
||||
@ -293,6 +312,7 @@ def _process_pdfminer_pages(
|
||||
key=lambda el: (
|
||||
el.coordinates[0][1] if el.coordinates else float("inf"),
|
||||
el.coordinates[0][0] if el.coordinates else float("inf"),
|
||||
el.id,
|
||||
),
|
||||
)
|
||||
elements += sorted_page_elements
|
||||
@ -318,7 +338,7 @@ def _partition_pdf_or_image_with_ocr(
|
||||
|
||||
if is_image:
|
||||
if file is not None:
|
||||
image = Image.open(file)
|
||||
image = PIL.Image.open(file)
|
||||
text = pytesseract.image_to_string(image, config=f"-l '{ocr_languages}'")
|
||||
else:
|
||||
text = pytesseract.image_to_string(filename, config=f"-l '{ocr_languages}'")
|
||||
|
Loading…
x
Reference in New Issue
Block a user