feat: add ability to pass headers in partition_html (#397)

Also adds pytest-mock requirement, those fixtures are nice to have!

Implements issue/feature #396.
This commit is contained in:
cragwolfe 2023-03-23 20:14:57 -07:00 committed by GitHub
parent a4394f6f16
commit ce9fc26009
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 247 additions and 147 deletions

View File

@ -1,8 +1,9 @@
## 0.5.7-dev2
## 0.5.7-dev3
### Enhancements
* Refactored codebase using `exactly_one`
* Adds ability to pass headers when passing a url in partition_html()
### Features

View File

@ -8,7 +8,7 @@ alabaster==0.7.13
# via sphinx
babel==2.12.1
# via sphinx
beautifulsoup4==4.11.2
beautifulsoup4==4.12.0
# via furo
certifi==2022.12.7
# via
@ -20,13 +20,13 @@ docutils==0.18.1
# via
# sphinx
# sphinx-rtd-theme
furo==2022.12.7
furo==2023.3.23
# via -r requirements/build.in
idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via sphinx
jinja2==3.1.2
# via sphinx
@ -52,6 +52,7 @@ sphinx==6.1.3
# furo
# sphinx-basic-ng
# sphinx-rtd-theme
# sphinxcontrib-jquery
sphinx-basic-ng==1.0.0b1
# via furo
sphinx-rtd-theme==1.2.0rc3
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
# via sphinx
sphinxcontrib-jquery==3.0.0
sphinxcontrib-jquery==4.1
# via sphinx-rtd-theme
sphinxcontrib-jsmath==1.0.1
# via sphinx
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==1.26.14
urllib3==1.26.15
# via requests
zipp==3.15.0
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -210,10 +210,13 @@ Examples:
The ``partition_html`` function partitions an HTML document and returns a list
of document ``Element`` objects. ``partition_html`` can take a filename, file-like
object, or string as input. The three examples below all produce the same output.
object, string, or url as input.
Examples:
These three invocations of partition_html() are essentially equivalent:
.. code:: python
from unstructured.partition.html import partition_html
@ -228,6 +231,22 @@ Examples:
elements = partition_html(text=text)
The following illustrates fetching a url and partitioning the response content.
.. code:: python
from unstructured.partition.html import partition_html
elements = partition_html(url="https://python.org/")
# you can also provide custom headers:
elements = partition_html(url="https://python.org/",
headers={"User-Agent": "YourScriptName/1.0 ..."})
``partition_pdf``
---------------------

View File

@ -65,7 +65,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
response.raise_for_status()
content = json.loads(response.content)
recent_forms = content["filings"]["recent"]
form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])}
form_types = dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))
return form_types

View File

@ -4,12 +4,9 @@
#
# pip-compile --output-file=requirements/base.txt
#
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com
anyio==3.6.2
# via httpcore
argilla==1.4.0
argilla==1.5.0
# via unstructured (setup.py)
backoff==2.2.1
# via argilla
@ -40,7 +37,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via markdown
joblib==1.2.0
# via nltk
@ -49,7 +46,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via unstructured (setup.py)
monotonic==1.6
# via argilla
@ -59,7 +56,7 @@ numpy==1.23.5
# via
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via unstructured (setup.py)
packaging==23.0
# via argilla
@ -71,7 +68,7 @@ pillow==9.4.0
# via
# python-pptx
# unstructured (setup.py)
pydantic==1.10.6
pydantic==1.10.7
# via argilla
pygments==2.14.0
# via rich
@ -87,7 +84,7 @@ python-pptx==0.6.21
# via unstructured (setup.py)
pytz==2022.7.1
# via pandas
regex==2022.10.31
regex==2023.3.23
# via nltk
requests==2.28.2
# via unstructured (setup.py)
@ -110,7 +107,7 @@ typing-extensions==4.5.0
# via
# pydantic
# rich
urllib3==1.26.14
urllib3==1.26.15
# via requests
wrapt==1.14.1
# via

View File

@ -8,7 +8,7 @@ alabaster==0.7.13
# via sphinx
babel==2.12.1
# via sphinx
beautifulsoup4==4.11.2
beautifulsoup4==4.12.0
# via furo
certifi==2022.12.7
# via
@ -20,13 +20,13 @@ docutils==0.18.1
# via
# sphinx
# sphinx-rtd-theme
furo==2022.12.7
furo==2023.3.23
# via -r requirements/build.in
idna==3.4
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via sphinx
jinja2==3.1.2
# via sphinx
@ -52,6 +52,7 @@ sphinx==6.1.3
# furo
# sphinx-basic-ng
# sphinx-rtd-theme
# sphinxcontrib-jquery
sphinx-basic-ng==1.0.0b1
# via furo
sphinx-rtd-theme==1.2.0rc3
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
# via sphinx
sphinxcontrib-jquery==3.0.0
sphinxcontrib-jquery==4.1
# via sphinx-rtd-theme
sphinxcontrib-jsmath==1.0.1
# via sphinx
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
urllib3==1.26.14
urllib3==1.26.15
# via requests
zipp==3.15.0
# via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -25,7 +25,7 @@ attrs==22.2.0
# via jsonschema
backcall==0.2.0
# via ipython
beautifulsoup4==4.11.2
beautifulsoup4==4.12.0
# via nbconvert
bleach==6.0.0
# via nbconvert
@ -37,7 +37,7 @@ cfgv==3.3.1
# via pre-commit
click==8.1.3
# via pip-tools
comm==0.1.2
comm==0.1.3
# via ipykernel
debugpy==1.6.6
# via ipykernel
@ -51,25 +51,24 @@ executing==1.2.0
# via stack-data
fastjsonschema==2.16.3
# via nbformat
filelock==3.9.0
filelock==3.10.3
# via virtualenv
fqdn==1.5.1
# via jsonschema
identify==2.5.19
identify==2.5.21
# via pre-commit
idna==3.4
# via
# anyio
# jsonschema
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# jupyter-client
# nbconvert
importlib-resources==5.12.0
# via jsonschema
ipykernel==6.21.3
ipykernel==6.22.0
# via
# ipywidgets
# jupyter
# jupyter-console
# nbclassic
@ -86,7 +85,7 @@ ipython-genutils==0.2.0
# nbclassic
# notebook
# qtconsole
ipywidgets==8.0.4
ipywidgets==8.0.5
# via jupyter
isoduration==20.11.0
# via jsonschema
@ -106,7 +105,7 @@ jsonschema[format-nongpl]==4.17.3
# nbformat
jupyter==1.0.0
# via -r requirements/dev.in
jupyter-client==8.0.3
jupyter-client==8.1.0
# via
# ipykernel
# jupyter-console
@ -117,7 +116,7 @@ jupyter-client==8.0.3
# qtconsole
jupyter-console==6.6.3
# via jupyter
jupyter-core==5.2.0
jupyter-core==5.3.0
# via
# -r requirements/dev.in
# ipykernel
@ -132,7 +131,7 @@ jupyter-core==5.2.0
# qtconsole
jupyter-events==0.6.3
# via jupyter-server
jupyter-server==2.4.0
jupyter-server==2.5.0
# via
# nbclassic
# notebook-shim
@ -140,7 +139,7 @@ jupyter-server-terminals==0.4.4
# via jupyter-server
jupyterlab-pygments==0.2.2
# via nbconvert
jupyterlab-widgets==3.0.5
jupyterlab-widgets==3.0.6
# via ipywidgets
markupsafe==2.1.2
# via
@ -156,13 +155,13 @@ nbclassic==0.5.3
# via notebook
nbclient==0.7.2
# via nbconvert
nbconvert==7.2.9
nbconvert==7.2.10
# via
# jupyter
# jupyter-server
# nbclassic
# notebook
nbformat==5.7.3
nbformat==5.8.0
# via
# jupyter-server
# nbclassic
@ -186,6 +185,7 @@ packaging==23.0
# ipykernel
# jupyter-server
# nbconvert
# qtconsole
# qtpy
pandocfilters==1.5.0
# via nbconvert
@ -203,7 +203,7 @@ platformdirs==3.1.1
# via
# jupyter-core
# virtualenv
pre-commit==3.1.1
pre-commit==3.2.0
# via -r requirements/dev.in
prometheus-client==0.16.0
# via
@ -244,7 +244,7 @@ pyyaml==6.0
# via
# jupyter-events
# pre-commit
pyzmq==25.0.0
pyzmq==25.0.2
# via
# ipykernel
# jupyter-client
@ -253,7 +253,7 @@ pyzmq==25.0.0
# nbclassic
# notebook
# qtconsole
qtconsole==5.4.0
qtconsole==5.4.1
# via jupyter
qtpy==2.3.0
# via qtconsole
@ -322,7 +322,7 @@ traitlets==5.9.0
# qtconsole
uri-template==1.2.0
# via jsonschema
virtualenv==20.20.0
virtualenv==20.21.0
# via pre-commit
wcwidth==0.2.6
# via prompt-toolkit
@ -334,11 +334,11 @@ webencodings==0.5.1
# tinycss2
websocket-client==1.5.1
# via jupyter-server
wheel==0.38.4
wheel==0.40.0
# via
# -r requirements/dev.in
# pip-tools
widgetsnbextension==4.0.5
widgetsnbextension==4.0.6
# via ipywidgets
zipp==3.15.0
# via

View File

@ -6,7 +6,7 @@
#
anyio==3.6.2
# via httpcore
argilla==1.4.0
argilla==1.5.0
# via unstructured (setup.py)
backoff==2.2.1
# via argilla
@ -28,9 +28,10 @@ deprecated==1.2.13
# via argilla
et-xmlfile==1.1.0
# via openpyxl
filelock==3.9.0
filelock==3.10.3
# via
# huggingface-hub
# torch
# transformers
h11==0.14.0
# via httpcore
@ -38,15 +39,17 @@ httpcore==0.16.3
# via httpx
httpx==0.23.3
# via argilla
huggingface-hub==0.13.1
huggingface-hub==0.13.3
# via transformers
idna==3.4
# via
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via markdown
jinja2==3.1.2
# via torch
joblib==1.2.0
# via
# nltk
@ -58,10 +61,16 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via unstructured (setup.py)
markupsafe==2.1.2
# via jinja2
monotonic==1.6
# via argilla
mpmath==1.3.0
# via sympy
networkx==3.0
# via torch
nltk==3.8.1
# via unstructured (setup.py)
numpy==1.23.5
@ -69,7 +78,7 @@ numpy==1.23.5
# argilla
# pandas
# transformers
openpyxl==3.1.1
openpyxl==3.1.2
# via unstructured (setup.py)
packaging==23.0
# via
@ -84,10 +93,12 @@ pillow==9.4.0
# via
# python-pptx
# unstructured (setup.py)
pydantic==1.10.6
pydantic==1.10.7
# via argilla
pygments==2.14.0
# via rich
pypandoc==1.11
# via unstructured (setup.py)
python-dateutil==2.8.2
# via pandas
python-docx==0.8.11
@ -102,7 +113,7 @@ pyyaml==6.0
# via
# huggingface-hub
# transformers
regex==2022.10.31
regex==2023.3.23
# via
# nltk
# sacremoses
@ -130,9 +141,11 @@ sniffio==1.3.0
# anyio
# httpcore
# httpx
sympy==1.11.1
# via torch
tokenizers==0.13.2
# via transformers
torch==1.13.1
torch==2.0.0
# via unstructured (setup.py)
tqdm==4.65.0
# via
@ -141,7 +154,7 @@ tqdm==4.65.0
# nltk
# sacremoses
# transformers
transformers==4.26.1
transformers==4.27.3
# via unstructured (setup.py)
typing-extensions==4.5.0
# via
@ -149,7 +162,7 @@ typing-extensions==4.5.0
# pydantic
# rich
# torch
urllib3==1.26.14
urllib3==1.26.15
# via requests
wrapt==1.14.1
# via

View File

@ -16,7 +16,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.3.1
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -50,7 +50,7 @@ cffi==1.15.1
# via
# azure-datalake-store
# cryptography
charset-normalizer==3.0.1
charset-normalizer==3.1.0
# via
# -r requirements/base.txt
# aiohttp
@ -59,7 +59,11 @@ click==8.1.3
# via
# -r requirements/base.txt
# nltk
cryptography==39.0.1
commonmark==0.9.1
# via
# -r requirements/base.txt
# rich
cryptography==39.0.2
# via
# adal
# azure-identity
@ -78,7 +82,7 @@ frozenlist==1.3.3
# via
# aiohttp
# aiosignal
fsspec==2023.1.0
fsspec==2023.3.0
# via
# adlfs
# unstructured (setup.py)
@ -101,7 +105,7 @@ idna==3.4
# requests
# rfc3986
# yarl
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -117,7 +121,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -144,7 +148,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -166,14 +170,22 @@ portalocker==2.7.0
# via msal-extensions
pycparser==2.21
# via cffi
pydantic==1.10.5
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pyjwt[crypto]==2.6.0
# via
# adal
# msal
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -195,7 +207,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -211,6 +223,10 @@ rfc3986[idna2008]==1.5.0
# via
# -r requirements/base.txt
# httpx
rich==13.0.1
# via
# -r requirements/base.txt
# argilla
six==1.16.0
# via
# -r requirements/base.txt
@ -224,7 +240,7 @@ sniffio==1.3.0
# anyio
# httpcore
# httpx
tqdm==4.64.1
tqdm==4.65.0
# via
# -r requirements/base.txt
# argilla
@ -235,7 +251,8 @@ typing-extensions==4.5.0
# azure-core
# azure-storage-blob
# pydantic
urllib3==1.26.14
# rich
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests
@ -244,7 +261,7 @@ wrapt==1.14.1
# -r requirements/base.txt
# argilla
# deprecated
xlsxwriter==3.0.8
xlsxwriter==3.0.9
# via
# -r requirements/base.txt
# python-pptx

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -64,7 +64,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -78,7 +78,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -95,7 +95,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -115,7 +115,7 @@ pillow==9.4.0
# unstructured (setup.py)
pycparser==2.21
# via cffi
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -129,6 +129,10 @@ pyjwt==2.6.0
# via pygithub
pynacl==1.5.0
# via pygithub
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -149,7 +153,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -186,7 +190,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt
# pydantic
# rich
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -61,7 +61,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -75,7 +75,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -92,7 +92,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -110,7 +110,7 @@ pillow==9.4.0
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -118,6 +118,10 @@ pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -140,7 +144,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -180,7 +184,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt
# pydantic
# rich
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -47,7 +47,7 @@ et-xmlfile==1.1.0
# openpyxl
google-api-core==2.11.0
# via google-api-python-client
google-api-python-client==2.80.0
google-api-python-client==2.82.0
# via unstructured (setup.py)
google-auth==2.16.2
# via
@ -56,7 +56,7 @@ google-auth==2.16.2
# google-auth-httplib2
google-auth-httplib2==0.1.0
# via google-api-python-client
googleapis-common-protos==1.58.0
googleapis-common-protos==1.59.0
# via google-api-core
h11==0.14.0
# via
@ -66,7 +66,7 @@ httpcore==0.16.3
# via
# -r requirements/base.txt
# httpx
httplib2==0.21.0
httplib2==0.22.0
# via
# google-api-python-client
# google-auth-httplib2
@ -80,7 +80,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -94,7 +94,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -111,7 +111,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -139,7 +139,7 @@ pyasn1==0.4.8
# rsa
pyasn1-modules==0.2.8
# via google-auth
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -147,6 +147,10 @@ pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
pyparsing==3.0.9
# via httplib2
python-dateutil==2.8.2
@ -169,7 +173,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -212,7 +216,7 @@ typing-extensions==4.5.0
# rich
uritemplate==4.1.1
# via google-api-python-client
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -61,7 +61,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -75,7 +75,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -92,7 +92,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -114,7 +114,7 @@ praw==7.7.0
# via unstructured (setup.py)
prawcore==2.3.0
# via praw
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -122,6 +122,10 @@ pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -142,7 +146,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -182,7 +186,7 @@ typing-extensions==4.5.0
# rich
update-checker==0.18.0
# via praw
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests

View File

@ -18,7 +18,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -87,7 +87,7 @@ idna==3.4
# requests
# rfc3986
# yarl
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -103,7 +103,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -124,7 +124,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -142,7 +142,7 @@ pillow==9.4.0
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -150,6 +150,10 @@ pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -171,7 +175,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -210,7 +214,7 @@ typing-extensions==4.5.0
# aioitertools
# pydantic
# rich
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# botocore

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via
# -r requirements/base.txt
# httpcore
argilla==1.4.0
argilla==1.5.0
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -16,7 +16,7 @@ backoff==2.2.1
# via
# -r requirements/base.txt
# argilla
beautifulsoup4==4.11.2
beautifulsoup4==4.12.0
# via wikipedia
certifi==2022.12.7
# via
@ -63,7 +63,7 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via
# -r requirements/base.txt
# markdown
@ -77,7 +77,7 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -94,7 +94,7 @@ numpy==1.23.5
# -r requirements/base.txt
# argilla
# pandas
openpyxl==3.1.1
openpyxl==3.1.2
# via
# -r requirements/base.txt
# unstructured (setup.py)
@ -112,7 +112,7 @@ pillow==9.4.0
# -r requirements/base.txt
# python-pptx
# unstructured (setup.py)
pydantic==1.10.6
pydantic==1.10.7
# via
# -r requirements/base.txt
# argilla
@ -120,6 +120,10 @@ pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2
# via
# -r requirements/base.txt
@ -140,7 +144,7 @@ pytz==2022.7.1
# via
# -r requirements/base.txt
# pandas
regex==2022.10.31
regex==2023.3.23
# via
# -r requirements/base.txt
# nltk
@ -179,7 +183,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt
# pydantic
# rich
urllib3==1.26.14
urllib3==1.26.15
# via
# -r requirements/base.txt
# requests

View File

@ -10,7 +10,7 @@ anyio==3.6.2
# via
# httpcore
# starlette
argilla==1.4.0
argilla==1.5.0
# via unstructured (setup.py)
backoff==2.2.1
# via argilla
@ -46,15 +46,16 @@ effdet==0.3.0
# via layoutparser
et-xmlfile==1.1.0
# via openpyxl
fastapi==0.94.0
fastapi==0.95.0
# via unstructured-inference
filelock==3.9.0
filelock==3.10.3
# via
# huggingface-hub
# torch
# transformers
flatbuffers==23.3.3
# via onnxruntime
fonttools==4.39.0
fonttools==4.39.2
# via matplotlib
h11==0.14.0
# via
@ -64,7 +65,7 @@ httpcore==0.16.3
# via httpx
httpx==0.23.3
# via argilla
huggingface-hub==0.13.1
huggingface-hub==0.13.3
# via
# timm
# transformers
@ -76,12 +77,14 @@ idna==3.4
# anyio
# requests
# rfc3986
importlib-metadata==6.0.0
importlib-metadata==6.1.0
# via markdown
importlib-resources==5.12.0
# via matplotlib
iopath==0.1.10
# via layoutparser
jinja2==3.1.2
# via torch
joblib==1.2.0
# via nltk
kiwisolver==1.4.4
@ -93,14 +96,18 @@ lxml==4.9.2
# python-docx
# python-pptx
# unstructured (setup.py)
markdown==3.4.1
markdown==3.4.3
# via unstructured (setup.py)
markupsafe==2.1.2
# via jinja2
matplotlib==3.7.1
# via pycocotools
monotonic==1.6
# via argilla
mpmath==1.3.0
# via sympy
networkx==3.0
# via torch
nltk==3.8.1
# via unstructured (setup.py)
numpy==1.23.5
@ -124,7 +131,7 @@ opencv-python==4.6.0.66
# via
# layoutparser
# unstructured-inference
openpyxl==3.1.1
openpyxl==3.1.2
# via unstructured (setup.py)
packaging==23.0
# via
@ -163,12 +170,14 @@ pycocotools==2.0.6
# via effdet
pycparser==2.21
# via cffi
pydantic==1.10.6
pydantic==1.10.7
# via
# argilla
# fastapi
pygments==2.14.0
# via rich
pypandoc==1.11
# via unstructured (setup.py)
pyparsing==3.0.9
# via matplotlib
pytesseract==0.3.10
@ -194,7 +203,7 @@ pyyaml==6.0
# omegaconf
# timm
# transformers
regex==2022.10.31
regex==2023.3.23
# via
# nltk
# transformers
@ -217,21 +226,23 @@ sniffio==1.3.0
# anyio
# httpcore
# httpx
starlette==0.26.0.post1
starlette==0.26.1
# via fastapi
sympy==1.11.1
# via onnxruntime
# via
# onnxruntime
# torch
timm==0.6.12
# via effdet
tokenizers==0.13.2
# via transformers
torch==1.13.1
torch==2.0.0
# via
# effdet
# layoutparser
# timm
# torchvision
torchvision==0.14.1
torchvision==0.15.1
# via
# effdet
# layoutparser
@ -243,7 +254,7 @@ tqdm==4.65.0
# iopath
# nltk
# transformers
transformers==4.26.1
transformers==4.27.3
# via unstructured-inference
typing-extensions==4.5.0
# via
@ -253,12 +264,11 @@ typing-extensions==4.5.0
# rich
# starlette
# torch
# torchvision
unstructured-inference==0.2.11
# via unstructured (setup.py)
urllib3==1.26.14
urllib3==1.26.15
# via requests
uvicorn==0.21.0
uvicorn==0.21.1
# via unstructured-inference
wand==0.6.11
# via pdfplumber

View File

@ -8,6 +8,7 @@ flake8
mypy
types-Markdown
pytest-cov
pytest-mock
label_studio_sdk
types-requests
vcrpy

View File

@ -4,9 +4,6 @@
#
# pip-compile requirements/test.in
#
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com
appdirs==1.4.4
# via label-studio-tools
attrs==22.2.0
@ -23,7 +20,7 @@ click==8.1.3
# via
# -r requirements/test.in
# black
coverage[toml]==7.2.1
coverage[toml]==7.2.2
# via
# -r requirements/test.in
# pytest-cov
@ -67,19 +64,23 @@ pluggy==1.0.0
# via pytest
pycodestyle==2.10.0
# via flake8
pydantic==1.10.6
pydantic==1.10.7
# via label-studio-sdk
pyflakes==3.0.1
# via flake8
pytest==7.2.2
# via pytest-cov
# via
# pytest-cov
# pytest-mock
pytest-cov==4.0.0
# via -r requirements/test.in
pytest-mock==3.10.0
# via -r requirements/test.in
pyyaml==6.0
# via vcrpy
requests==2.28.2
# via label-studio-sdk
ruff==0.0.256
ruff==0.0.259
# via -r requirements/test.in
six==1.16.0
# via vcrpy
@ -91,7 +92,7 @@ tomli==2.0.1
# pytest
types-markdown==3.4.2.5
# via -r requirements/test.in
types-requests==2.28.11.15
types-requests==2.28.11.16
# via -r requirements/test.in
types-urllib3==1.26.25.8
# via types-requests

View File

@ -4,6 +4,7 @@ from unittest.mock import patch
import pytest
import requests
from requests.models import Response
from unstructured.documents.elements import PageBreak
from unstructured.partition.html import partition_html
@ -86,6 +87,25 @@ def test_partition_html_from_url_raises_with_bad_content_type():
partition_html(url="https://fake.url")
def test_partition_from_url_uses_headers(mocker):
    """Check that partition_html forwards caller-supplied headers to requests.get."""
    url = "https://example.com"
    custom_headers = {"User-Agent": "test"}

    # Hand-build a successful HTML response to serve as the mocked return value.
    fake_response = Response()
    fake_response.status_code = 200
    fake_response._content = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )
    fake_response.headers = {"Content-Type": "text/html"}

    patched_get = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=url, headers=custom_headers)

    # Exactly one request must have been made, carrying the custom headers.
    patched_get.assert_called_once_with(url, headers=custom_headers)
def test_partition_html_raises_with_none_specified():
with pytest.raises(ValueError):
partition_html()

View File

@ -1 +1 @@
__version__ = "0.5.7-dev2" # pragma: no cover
__version__ = "0.5.7-dev3" # pragma: no cover

View File

@ -1,4 +1,4 @@
from typing import IO, List, Optional
from typing import IO, Dict, List, Optional
import requests
@ -20,6 +20,7 @@ def partition_html(
encoding: Optional[str] = None,
include_page_breaks: bool = False,
include_metadata: bool = True,
headers: Dict[str, str] = {},
parser: VALID_PARSERS = None,
) -> List[Element]:
"""Partitions an HTML document into its constituent elements.
@ -67,7 +68,7 @@ def partition_html(
document = HTMLDocument.from_string(_text, parser=parser)
elif url is not None:
response = requests.get(url)
response = requests.get(url, headers=headers)
if not response.ok:
raise ValueError(f"URL return an error: {response.status_code}")