feat: add ability to pass headers in partition_html (#397)

Also adds pytest-mock requirement, those fixtures are nice to have!

Implements issue/feature #396.
This commit is contained in:
cragwolfe 2023-03-23 20:14:57 -07:00 committed by GitHub
parent a4394f6f16
commit ce9fc26009
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 247 additions and 147 deletions

View File

@ -1,8 +1,9 @@
## 0.5.7-dev2 ## 0.5.7-dev3
### Enhancements ### Enhancements
* Refactored codebase using `exactly_one` * Refactored codebase using `exactly_one`
* Adds ability to pass headers when passing a url in partition_html()
### Features ### Features

View File

@ -8,7 +8,7 @@ alabaster==0.7.13
# via sphinx # via sphinx
babel==2.12.1 babel==2.12.1
# via sphinx # via sphinx
beautifulsoup4==4.11.2 beautifulsoup4==4.12.0
# via furo # via furo
certifi==2022.12.7 certifi==2022.12.7
# via # via
@ -20,13 +20,13 @@ docutils==0.18.1
# via # via
# sphinx # sphinx
# sphinx-rtd-theme # sphinx-rtd-theme
furo==2022.12.7 furo==2023.3.23
# via -r requirements/build.in # via -r requirements/build.in
idna==3.4 idna==3.4
# via requests # via requests
imagesize==1.4.1 imagesize==1.4.1
# via sphinx # via sphinx
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via sphinx # via sphinx
jinja2==3.1.2 jinja2==3.1.2
# via sphinx # via sphinx
@ -52,6 +52,7 @@ sphinx==6.1.3
# furo # furo
# sphinx-basic-ng # sphinx-basic-ng
# sphinx-rtd-theme # sphinx-rtd-theme
# sphinxcontrib-jquery
sphinx-basic-ng==1.0.0b1 sphinx-basic-ng==1.0.0b1
# via furo # via furo
sphinx-rtd-theme==1.2.0rc3 sphinx-rtd-theme==1.2.0rc3
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
# via sphinx # via sphinx
sphinxcontrib-htmlhelp==2.0.1 sphinxcontrib-htmlhelp==2.0.1
# via sphinx # via sphinx
sphinxcontrib-jquery==3.0.0 sphinxcontrib-jquery==4.1
# via sphinx-rtd-theme # via sphinx-rtd-theme
sphinxcontrib-jsmath==1.0.1 sphinxcontrib-jsmath==1.0.1
# via sphinx # via sphinx
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx # via sphinx
sphinxcontrib-serializinghtml==1.1.5 sphinxcontrib-serializinghtml==1.1.5
# via sphinx # via sphinx
urllib3==1.26.14 urllib3==1.26.15
# via requests # via requests
zipp==3.15.0 zipp==3.15.0
# via importlib-metadata # via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -210,10 +210,13 @@ Examples:
The ``partition_html`` function partitions an HTML document and returns a list The ``partition_html`` function partitions an HTML document and returns a list
of document ``Element`` objects. ``partition_html`` can take a filename, file-like of document ``Element`` objects. ``partition_html`` can take a filename, file-like
object, or string as input. The three examples below all produce the same output. object, string, or url as input.
Examples: Examples:
These three invocations of partition_html() are essentially equivalent:
.. code:: python .. code:: python
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
@ -228,6 +231,22 @@ Examples:
elements = partition_html(text=text) elements = partition_html(text=text)
The following illustrates fetching a url and partitioning the response content.
.. code:: python
from unstructured.partition.html import partition_html
elements = partition_html(url="https://python.org/")
# you can also provide custom headers:
elements = partition_html(url="https://python.org/",
headers={"User-Agent": "YourScriptName/1.0 ..."})
``partition_pdf`` ``partition_pdf``
--------------------- ---------------------

View File

@ -65,7 +65,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
response.raise_for_status() response.raise_for_status()
content = json.loads(response.content) content = json.loads(response.content)
recent_forms = content["filings"]["recent"] recent_forms = content["filings"]["recent"]
form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])} form_types = dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))
return form_types return form_types

View File

@ -4,12 +4,9 @@
# #
# pip-compile --output-file=requirements/base.txt # pip-compile --output-file=requirements/base.txt
# #
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com
anyio==3.6.2 anyio==3.6.2
# via httpcore # via httpcore
argilla==1.4.0 argilla==1.5.0
# via unstructured (setup.py) # via unstructured (setup.py)
backoff==2.2.1 backoff==2.2.1
# via argilla # via argilla
@ -40,7 +37,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via markdown # via markdown
joblib==1.2.0 joblib==1.2.0
# via nltk # via nltk
@ -49,7 +46,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via unstructured (setup.py) # via unstructured (setup.py)
monotonic==1.6 monotonic==1.6
# via argilla # via argilla
@ -59,7 +56,7 @@ numpy==1.23.5
# via # via
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via unstructured (setup.py) # via unstructured (setup.py)
packaging==23.0 packaging==23.0
# via argilla # via argilla
@ -71,7 +68,7 @@ pillow==9.4.0
# via # via
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
pydantic==1.10.6 pydantic==1.10.7
# via argilla # via argilla
pygments==2.14.0 pygments==2.14.0
# via rich # via rich
@ -87,7 +84,7 @@ python-pptx==0.6.21
# via unstructured (setup.py) # via unstructured (setup.py)
pytz==2022.7.1 pytz==2022.7.1
# via pandas # via pandas
regex==2022.10.31 regex==2023.3.23
# via nltk # via nltk
requests==2.28.2 requests==2.28.2
# via unstructured (setup.py) # via unstructured (setup.py)
@ -110,7 +107,7 @@ typing-extensions==4.5.0
# via # via
# pydantic # pydantic
# rich # rich
urllib3==1.26.14 urllib3==1.26.15
# via requests # via requests
wrapt==1.14.1 wrapt==1.14.1
# via # via

View File

@ -8,7 +8,7 @@ alabaster==0.7.13
# via sphinx # via sphinx
babel==2.12.1 babel==2.12.1
# via sphinx # via sphinx
beautifulsoup4==4.11.2 beautifulsoup4==4.12.0
# via furo # via furo
certifi==2022.12.7 certifi==2022.12.7
# via # via
@ -20,13 +20,13 @@ docutils==0.18.1
# via # via
# sphinx # sphinx
# sphinx-rtd-theme # sphinx-rtd-theme
furo==2022.12.7 furo==2023.3.23
# via -r requirements/build.in # via -r requirements/build.in
idna==3.4 idna==3.4
# via requests # via requests
imagesize==1.4.1 imagesize==1.4.1
# via sphinx # via sphinx
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via sphinx # via sphinx
jinja2==3.1.2 jinja2==3.1.2
# via sphinx # via sphinx
@ -52,6 +52,7 @@ sphinx==6.1.3
# furo # furo
# sphinx-basic-ng # sphinx-basic-ng
# sphinx-rtd-theme # sphinx-rtd-theme
# sphinxcontrib-jquery
sphinx-basic-ng==1.0.0b1 sphinx-basic-ng==1.0.0b1
# via furo # via furo
sphinx-rtd-theme==1.2.0rc3 sphinx-rtd-theme==1.2.0rc3
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
# via sphinx # via sphinx
sphinxcontrib-htmlhelp==2.0.1 sphinxcontrib-htmlhelp==2.0.1
# via sphinx # via sphinx
sphinxcontrib-jquery==3.0.0 sphinxcontrib-jquery==4.1
# via sphinx-rtd-theme # via sphinx-rtd-theme
sphinxcontrib-jsmath==1.0.1 sphinxcontrib-jsmath==1.0.1
# via sphinx # via sphinx
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx # via sphinx
sphinxcontrib-serializinghtml==1.1.5 sphinxcontrib-serializinghtml==1.1.5
# via sphinx # via sphinx
urllib3==1.26.14 urllib3==1.26.15
# via requests # via requests
zipp==3.15.0 zipp==3.15.0
# via importlib-metadata # via importlib-metadata
# The following packages are considered to be unsafe in a requirements file:
# setuptools

View File

@ -25,7 +25,7 @@ attrs==22.2.0
# via jsonschema # via jsonschema
backcall==0.2.0 backcall==0.2.0
# via ipython # via ipython
beautifulsoup4==4.11.2 beautifulsoup4==4.12.0
# via nbconvert # via nbconvert
bleach==6.0.0 bleach==6.0.0
# via nbconvert # via nbconvert
@ -37,7 +37,7 @@ cfgv==3.3.1
# via pre-commit # via pre-commit
click==8.1.3 click==8.1.3
# via pip-tools # via pip-tools
comm==0.1.2 comm==0.1.3
# via ipykernel # via ipykernel
debugpy==1.6.6 debugpy==1.6.6
# via ipykernel # via ipykernel
@ -51,25 +51,24 @@ executing==1.2.0
# via stack-data # via stack-data
fastjsonschema==2.16.3 fastjsonschema==2.16.3
# via nbformat # via nbformat
filelock==3.9.0 filelock==3.10.3
# via virtualenv # via virtualenv
fqdn==1.5.1 fqdn==1.5.1
# via jsonschema # via jsonschema
identify==2.5.19 identify==2.5.21
# via pre-commit # via pre-commit
idna==3.4 idna==3.4
# via # via
# anyio # anyio
# jsonschema # jsonschema
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# jupyter-client # jupyter-client
# nbconvert # nbconvert
importlib-resources==5.12.0 importlib-resources==5.12.0
# via jsonschema # via jsonschema
ipykernel==6.21.3 ipykernel==6.22.0
# via # via
# ipywidgets
# jupyter # jupyter
# jupyter-console # jupyter-console
# nbclassic # nbclassic
@ -86,7 +85,7 @@ ipython-genutils==0.2.0
# nbclassic # nbclassic
# notebook # notebook
# qtconsole # qtconsole
ipywidgets==8.0.4 ipywidgets==8.0.5
# via jupyter # via jupyter
isoduration==20.11.0 isoduration==20.11.0
# via jsonschema # via jsonschema
@ -106,7 +105,7 @@ jsonschema[format-nongpl]==4.17.3
# nbformat # nbformat
jupyter==1.0.0 jupyter==1.0.0
# via -r requirements/dev.in # via -r requirements/dev.in
jupyter-client==8.0.3 jupyter-client==8.1.0
# via # via
# ipykernel # ipykernel
# jupyter-console # jupyter-console
@ -117,7 +116,7 @@ jupyter-client==8.0.3
# qtconsole # qtconsole
jupyter-console==6.6.3 jupyter-console==6.6.3
# via jupyter # via jupyter
jupyter-core==5.2.0 jupyter-core==5.3.0
# via # via
# -r requirements/dev.in # -r requirements/dev.in
# ipykernel # ipykernel
@ -132,7 +131,7 @@ jupyter-core==5.2.0
# qtconsole # qtconsole
jupyter-events==0.6.3 jupyter-events==0.6.3
# via jupyter-server # via jupyter-server
jupyter-server==2.4.0 jupyter-server==2.5.0
# via # via
# nbclassic # nbclassic
# notebook-shim # notebook-shim
@ -140,7 +139,7 @@ jupyter-server-terminals==0.4.4
# via jupyter-server # via jupyter-server
jupyterlab-pygments==0.2.2 jupyterlab-pygments==0.2.2
# via nbconvert # via nbconvert
jupyterlab-widgets==3.0.5 jupyterlab-widgets==3.0.6
# via ipywidgets # via ipywidgets
markupsafe==2.1.2 markupsafe==2.1.2
# via # via
@ -156,13 +155,13 @@ nbclassic==0.5.3
# via notebook # via notebook
nbclient==0.7.2 nbclient==0.7.2
# via nbconvert # via nbconvert
nbconvert==7.2.9 nbconvert==7.2.10
# via # via
# jupyter # jupyter
# jupyter-server # jupyter-server
# nbclassic # nbclassic
# notebook # notebook
nbformat==5.7.3 nbformat==5.8.0
# via # via
# jupyter-server # jupyter-server
# nbclassic # nbclassic
@ -186,6 +185,7 @@ packaging==23.0
# ipykernel # ipykernel
# jupyter-server # jupyter-server
# nbconvert # nbconvert
# qtconsole
# qtpy # qtpy
pandocfilters==1.5.0 pandocfilters==1.5.0
# via nbconvert # via nbconvert
@ -203,7 +203,7 @@ platformdirs==3.1.1
# via # via
# jupyter-core # jupyter-core
# virtualenv # virtualenv
pre-commit==3.1.1 pre-commit==3.2.0
# via -r requirements/dev.in # via -r requirements/dev.in
prometheus-client==0.16.0 prometheus-client==0.16.0
# via # via
@ -244,7 +244,7 @@ pyyaml==6.0
# via # via
# jupyter-events # jupyter-events
# pre-commit # pre-commit
pyzmq==25.0.0 pyzmq==25.0.2
# via # via
# ipykernel # ipykernel
# jupyter-client # jupyter-client
@ -253,7 +253,7 @@ pyzmq==25.0.0
# nbclassic # nbclassic
# notebook # notebook
# qtconsole # qtconsole
qtconsole==5.4.0 qtconsole==5.4.1
# via jupyter # via jupyter
qtpy==2.3.0 qtpy==2.3.0
# via qtconsole # via qtconsole
@ -322,7 +322,7 @@ traitlets==5.9.0
# qtconsole # qtconsole
uri-template==1.2.0 uri-template==1.2.0
# via jsonschema # via jsonschema
virtualenv==20.20.0 virtualenv==20.21.0
# via pre-commit # via pre-commit
wcwidth==0.2.6 wcwidth==0.2.6
# via prompt-toolkit # via prompt-toolkit
@ -334,11 +334,11 @@ webencodings==0.5.1
# tinycss2 # tinycss2
websocket-client==1.5.1 websocket-client==1.5.1
# via jupyter-server # via jupyter-server
wheel==0.38.4 wheel==0.40.0
# via # via
# -r requirements/dev.in # -r requirements/dev.in
# pip-tools # pip-tools
widgetsnbextension==4.0.5 widgetsnbextension==4.0.6
# via ipywidgets # via ipywidgets
zipp==3.15.0 zipp==3.15.0
# via # via

View File

@ -6,7 +6,7 @@
# #
anyio==3.6.2 anyio==3.6.2
# via httpcore # via httpcore
argilla==1.4.0 argilla==1.5.0
# via unstructured (setup.py) # via unstructured (setup.py)
backoff==2.2.1 backoff==2.2.1
# via argilla # via argilla
@ -28,9 +28,10 @@ deprecated==1.2.13
# via argilla # via argilla
et-xmlfile==1.1.0 et-xmlfile==1.1.0
# via openpyxl # via openpyxl
filelock==3.9.0 filelock==3.10.3
# via # via
# huggingface-hub # huggingface-hub
# torch
# transformers # transformers
h11==0.14.0 h11==0.14.0
# via httpcore # via httpcore
@ -38,15 +39,17 @@ httpcore==0.16.3
# via httpx # via httpx
httpx==0.23.3 httpx==0.23.3
# via argilla # via argilla
huggingface-hub==0.13.1 huggingface-hub==0.13.3
# via transformers # via transformers
idna==3.4 idna==3.4
# via # via
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via markdown # via markdown
jinja2==3.1.2
# via torch
joblib==1.2.0 joblib==1.2.0
# via # via
# nltk # nltk
@ -58,10 +61,16 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via unstructured (setup.py) # via unstructured (setup.py)
markupsafe==2.1.2
# via jinja2
monotonic==1.6 monotonic==1.6
# via argilla # via argilla
mpmath==1.3.0
# via sympy
networkx==3.0
# via torch
nltk==3.8.1 nltk==3.8.1
# via unstructured (setup.py) # via unstructured (setup.py)
numpy==1.23.5 numpy==1.23.5
@ -69,7 +78,7 @@ numpy==1.23.5
# argilla # argilla
# pandas # pandas
# transformers # transformers
openpyxl==3.1.1 openpyxl==3.1.2
# via unstructured (setup.py) # via unstructured (setup.py)
packaging==23.0 packaging==23.0
# via # via
@ -84,10 +93,12 @@ pillow==9.4.0
# via # via
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
pydantic==1.10.6 pydantic==1.10.7
# via argilla # via argilla
pygments==2.14.0 pygments==2.14.0
# via rich # via rich
pypandoc==1.11
# via unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via pandas # via pandas
python-docx==0.8.11 python-docx==0.8.11
@ -102,7 +113,7 @@ pyyaml==6.0
# via # via
# huggingface-hub # huggingface-hub
# transformers # transformers
regex==2022.10.31 regex==2023.3.23
# via # via
# nltk # nltk
# sacremoses # sacremoses
@ -130,9 +141,11 @@ sniffio==1.3.0
# anyio # anyio
# httpcore # httpcore
# httpx # httpx
sympy==1.11.1
# via torch
tokenizers==0.13.2 tokenizers==0.13.2
# via transformers # via transformers
torch==1.13.1 torch==2.0.0
# via unstructured (setup.py) # via unstructured (setup.py)
tqdm==4.65.0 tqdm==4.65.0
# via # via
@ -141,7 +154,7 @@ tqdm==4.65.0
# nltk # nltk
# sacremoses # sacremoses
# transformers # transformers
transformers==4.26.1 transformers==4.27.3
# via unstructured (setup.py) # via unstructured (setup.py)
typing-extensions==4.5.0 typing-extensions==4.5.0
# via # via
@ -149,7 +162,7 @@ typing-extensions==4.5.0
# pydantic # pydantic
# rich # rich
# torch # torch
urllib3==1.26.14 urllib3==1.26.15
# via requests # via requests
wrapt==1.14.1 wrapt==1.14.1
# via # via

View File

@ -16,7 +16,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.3.1 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -50,7 +50,7 @@ cffi==1.15.1
# via # via
# azure-datalake-store # azure-datalake-store
# cryptography # cryptography
charset-normalizer==3.0.1 charset-normalizer==3.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# aiohttp # aiohttp
@ -59,7 +59,11 @@ click==8.1.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
cryptography==39.0.1 commonmark==0.9.1
# via
# -r requirements/base.txt
# rich
cryptography==39.0.2
# via # via
# adal # adal
# azure-identity # azure-identity
@ -78,7 +82,7 @@ frozenlist==1.3.3
# via # via
# aiohttp # aiohttp
# aiosignal # aiosignal
fsspec==2023.1.0 fsspec==2023.3.0
# via # via
# adlfs # adlfs
# unstructured (setup.py) # unstructured (setup.py)
@ -101,7 +105,7 @@ idna==3.4
# requests # requests
# rfc3986 # rfc3986
# yarl # yarl
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -117,7 +121,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -144,7 +148,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -166,14 +170,22 @@ portalocker==2.7.0
# via msal-extensions # via msal-extensions
pycparser==2.21 pycparser==2.21
# via cffi # via cffi
pydantic==1.10.5 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
pygments==2.14.0
# via
# -r requirements/base.txt
# rich
pyjwt[crypto]==2.6.0 pyjwt[crypto]==2.6.0
# via # via
# adal # adal
# msal # msal
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -195,7 +207,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -211,6 +223,10 @@ rfc3986[idna2008]==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpx # httpx
rich==13.0.1
# via
# -r requirements/base.txt
# argilla
six==1.16.0 six==1.16.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -224,7 +240,7 @@ sniffio==1.3.0
# anyio # anyio
# httpcore # httpcore
# httpx # httpx
tqdm==4.64.1 tqdm==4.65.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -235,7 +251,8 @@ typing-extensions==4.5.0
# azure-core # azure-core
# azure-storage-blob # azure-storage-blob
# pydantic # pydantic
urllib3==1.26.14 # rich
urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests
@ -244,7 +261,7 @@ wrapt==1.14.1
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# deprecated # deprecated
xlsxwriter==3.0.8 xlsxwriter==3.0.9
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# python-pptx # python-pptx

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -64,7 +64,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -78,7 +78,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -95,7 +95,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -115,7 +115,7 @@ pillow==9.4.0
# unstructured (setup.py) # unstructured (setup.py)
pycparser==2.21 pycparser==2.21
# via cffi # via cffi
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -129,6 +129,10 @@ pyjwt==2.6.0
# via pygithub # via pygithub
pynacl==1.5.0 pynacl==1.5.0
# via pygithub # via pygithub
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -149,7 +153,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -186,7 +190,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt # -r requirements/base.txt
# pydantic # pydantic
# rich # rich
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -61,7 +61,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -75,7 +75,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -92,7 +92,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -110,7 +110,7 @@ pillow==9.4.0
# -r requirements/base.txt # -r requirements/base.txt
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -118,6 +118,10 @@ pygments==2.14.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# rich # rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -140,7 +144,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -180,7 +184,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt # -r requirements/base.txt
# pydantic # pydantic
# rich # rich
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -47,7 +47,7 @@ et-xmlfile==1.1.0
# openpyxl # openpyxl
google-api-core==2.11.0 google-api-core==2.11.0
# via google-api-python-client # via google-api-python-client
google-api-python-client==2.80.0 google-api-python-client==2.82.0
# via unstructured (setup.py) # via unstructured (setup.py)
google-auth==2.16.2 google-auth==2.16.2
# via # via
@ -56,7 +56,7 @@ google-auth==2.16.2
# google-auth-httplib2 # google-auth-httplib2
google-auth-httplib2==0.1.0 google-auth-httplib2==0.1.0
# via google-api-python-client # via google-api-python-client
googleapis-common-protos==1.58.0 googleapis-common-protos==1.59.0
# via google-api-core # via google-api-core
h11==0.14.0 h11==0.14.0
# via # via
@ -66,7 +66,7 @@ httpcore==0.16.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpx # httpx
httplib2==0.21.0 httplib2==0.22.0
# via # via
# google-api-python-client # google-api-python-client
# google-auth-httplib2 # google-auth-httplib2
@ -80,7 +80,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -94,7 +94,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -111,7 +111,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -139,7 +139,7 @@ pyasn1==0.4.8
# rsa # rsa
pyasn1-modules==0.2.8 pyasn1-modules==0.2.8
# via google-auth # via google-auth
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -147,6 +147,10 @@ pygments==2.14.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# rich # rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
pyparsing==3.0.9 pyparsing==3.0.9
# via httplib2 # via httplib2
python-dateutil==2.8.2 python-dateutil==2.8.2
@ -169,7 +173,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -212,7 +216,7 @@ typing-extensions==4.5.0
# rich # rich
uritemplate==4.1.1 uritemplate==4.1.1
# via google-api-python-client # via google-api-python-client
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -61,7 +61,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -75,7 +75,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -92,7 +92,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -114,7 +114,7 @@ praw==7.7.0
# via unstructured (setup.py) # via unstructured (setup.py)
prawcore==2.3.0 prawcore==2.3.0
# via praw # via praw
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -122,6 +122,10 @@ pygments==2.14.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# rich # rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -142,7 +146,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -182,7 +186,7 @@ typing-extensions==4.5.0
# rich # rich
update-checker==0.18.0 update-checker==0.18.0
# via praw # via praw
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests

View File

@ -18,7 +18,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -87,7 +87,7 @@ idna==3.4
# requests # requests
# rfc3986 # rfc3986
# yarl # yarl
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -103,7 +103,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -124,7 +124,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -142,7 +142,7 @@ pillow==9.4.0
# -r requirements/base.txt # -r requirements/base.txt
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -150,6 +150,10 @@ pygments==2.14.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# rich # rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -171,7 +175,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -210,7 +214,7 @@ typing-extensions==4.5.0
# aioitertools # aioitertools
# pydantic # pydantic
# rich # rich
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# botocore # botocore

View File

@ -8,7 +8,7 @@ anyio==3.6.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# httpcore # httpcore
argilla==1.4.0 argilla==1.5.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -16,7 +16,7 @@ backoff==2.2.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
beautifulsoup4==4.11.2 beautifulsoup4==4.12.0
# via wikipedia # via wikipedia
certifi==2022.12.7 certifi==2022.12.7
# via # via
@ -63,7 +63,7 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# markdown # markdown
@ -77,7 +77,7 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -94,7 +94,7 @@ numpy==1.23.5
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
# pandas # pandas
openpyxl==3.1.1 openpyxl==3.1.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# unstructured (setup.py) # unstructured (setup.py)
@ -112,7 +112,7 @@ pillow==9.4.0
# -r requirements/base.txt # -r requirements/base.txt
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
pydantic==1.10.6 pydantic==1.10.7
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# argilla # argilla
@ -120,6 +120,10 @@ pygments==2.14.0
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# rich # rich
pypandoc==1.11
# via
# -r requirements/base.txt
# unstructured (setup.py)
python-dateutil==2.8.2 python-dateutil==2.8.2
# via # via
# -r requirements/base.txt # -r requirements/base.txt
@ -140,7 +144,7 @@ pytz==2022.7.1
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# pandas # pandas
regex==2022.10.31 regex==2023.3.23
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# nltk # nltk
@ -179,7 +183,7 @@ typing-extensions==4.5.0
# -r requirements/base.txt # -r requirements/base.txt
# pydantic # pydantic
# rich # rich
urllib3==1.26.14 urllib3==1.26.15
# via # via
# -r requirements/base.txt # -r requirements/base.txt
# requests # requests

View File

@ -10,7 +10,7 @@ anyio==3.6.2
# via # via
# httpcore # httpcore
# starlette # starlette
argilla==1.4.0 argilla==1.5.0
# via unstructured (setup.py) # via unstructured (setup.py)
backoff==2.2.1 backoff==2.2.1
# via argilla # via argilla
@ -46,15 +46,16 @@ effdet==0.3.0
# via layoutparser # via layoutparser
et-xmlfile==1.1.0 et-xmlfile==1.1.0
# via openpyxl # via openpyxl
fastapi==0.94.0 fastapi==0.95.0
# via unstructured-inference # via unstructured-inference
filelock==3.9.0 filelock==3.10.3
# via # via
# huggingface-hub # huggingface-hub
# torch
# transformers # transformers
flatbuffers==23.3.3 flatbuffers==23.3.3
# via onnxruntime # via onnxruntime
fonttools==4.39.0 fonttools==4.39.2
# via matplotlib # via matplotlib
h11==0.14.0 h11==0.14.0
# via # via
@ -64,7 +65,7 @@ httpcore==0.16.3
# via httpx # via httpx
httpx==0.23.3 httpx==0.23.3
# via argilla # via argilla
huggingface-hub==0.13.1 huggingface-hub==0.13.3
# via # via
# timm # timm
# transformers # transformers
@ -76,12 +77,14 @@ idna==3.4
# anyio # anyio
# requests # requests
# rfc3986 # rfc3986
importlib-metadata==6.0.0 importlib-metadata==6.1.0
# via markdown # via markdown
importlib-resources==5.12.0 importlib-resources==5.12.0
# via matplotlib # via matplotlib
iopath==0.1.10 iopath==0.1.10
# via layoutparser # via layoutparser
jinja2==3.1.2
# via torch
joblib==1.2.0 joblib==1.2.0
# via nltk # via nltk
kiwisolver==1.4.4 kiwisolver==1.4.4
@ -93,14 +96,18 @@ lxml==4.9.2
# python-docx # python-docx
# python-pptx # python-pptx
# unstructured (setup.py) # unstructured (setup.py)
markdown==3.4.1 markdown==3.4.3
# via unstructured (setup.py) # via unstructured (setup.py)
markupsafe==2.1.2
# via jinja2
matplotlib==3.7.1 matplotlib==3.7.1
# via pycocotools # via pycocotools
monotonic==1.6 monotonic==1.6
# via argilla # via argilla
mpmath==1.3.0 mpmath==1.3.0
# via sympy # via sympy
networkx==3.0
# via torch
nltk==3.8.1 nltk==3.8.1
# via unstructured (setup.py) # via unstructured (setup.py)
numpy==1.23.5 numpy==1.23.5
@ -124,7 +131,7 @@ opencv-python==4.6.0.66
# via # via
# layoutparser # layoutparser
# unstructured-inference # unstructured-inference
openpyxl==3.1.1 openpyxl==3.1.2
# via unstructured (setup.py) # via unstructured (setup.py)
packaging==23.0 packaging==23.0
# via # via
@ -163,12 +170,14 @@ pycocotools==2.0.6
# via effdet # via effdet
pycparser==2.21 pycparser==2.21
# via cffi # via cffi
pydantic==1.10.6 pydantic==1.10.7
# via # via
# argilla # argilla
# fastapi # fastapi
pygments==2.14.0 pygments==2.14.0
# via rich # via rich
pypandoc==1.11
# via unstructured (setup.py)
pyparsing==3.0.9 pyparsing==3.0.9
# via matplotlib # via matplotlib
pytesseract==0.3.10 pytesseract==0.3.10
@ -194,7 +203,7 @@ pyyaml==6.0
# omegaconf # omegaconf
# timm # timm
# transformers # transformers
regex==2022.10.31 regex==2023.3.23
# via # via
# nltk # nltk
# transformers # transformers
@ -217,21 +226,23 @@ sniffio==1.3.0
# anyio # anyio
# httpcore # httpcore
# httpx # httpx
starlette==0.26.0.post1 starlette==0.26.1
# via fastapi # via fastapi
sympy==1.11.1 sympy==1.11.1
# via onnxruntime # via
# onnxruntime
# torch
timm==0.6.12 timm==0.6.12
# via effdet # via effdet
tokenizers==0.13.2 tokenizers==0.13.2
# via transformers # via transformers
torch==1.13.1 torch==2.0.0
# via # via
# effdet # effdet
# layoutparser # layoutparser
# timm # timm
# torchvision # torchvision
torchvision==0.14.1 torchvision==0.15.1
# via # via
# effdet # effdet
# layoutparser # layoutparser
@ -243,7 +254,7 @@ tqdm==4.65.0
# iopath # iopath
# nltk # nltk
# transformers # transformers
transformers==4.26.1 transformers==4.27.3
# via unstructured-inference # via unstructured-inference
typing-extensions==4.5.0 typing-extensions==4.5.0
# via # via
@ -253,12 +264,11 @@ typing-extensions==4.5.0
# rich # rich
# starlette # starlette
# torch # torch
# torchvision
unstructured-inference==0.2.11 unstructured-inference==0.2.11
# via unstructured (setup.py) # via unstructured (setup.py)
urllib3==1.26.14 urllib3==1.26.15
# via requests # via requests
uvicorn==0.21.0 uvicorn==0.21.1
# via unstructured-inference # via unstructured-inference
wand==0.6.11 wand==0.6.11
# via pdfplumber # via pdfplumber

View File

@ -8,6 +8,7 @@ flake8
mypy mypy
types-Markdown types-Markdown
pytest-cov pytest-cov
pytest-mock
label_studio_sdk label_studio_sdk
types-requests types-requests
vcrpy vcrpy

View File

@ -4,9 +4,6 @@
# #
# pip-compile requirements/test.in # pip-compile requirements/test.in
# #
--extra-index-url https://pypi.ngc.nvidia.com
--trusted-host pypi.ngc.nvidia.com
appdirs==1.4.4 appdirs==1.4.4
# via label-studio-tools # via label-studio-tools
attrs==22.2.0 attrs==22.2.0
@ -23,7 +20,7 @@ click==8.1.3
# via # via
# -r requirements/test.in # -r requirements/test.in
# black # black
coverage[toml]==7.2.1 coverage[toml]==7.2.2
# via # via
# -r requirements/test.in # -r requirements/test.in
# pytest-cov # pytest-cov
@ -67,19 +64,23 @@ pluggy==1.0.0
# via pytest # via pytest
pycodestyle==2.10.0 pycodestyle==2.10.0
# via flake8 # via flake8
pydantic==1.10.6 pydantic==1.10.7
# via label-studio-sdk # via label-studio-sdk
pyflakes==3.0.1 pyflakes==3.0.1
# via flake8 # via flake8
pytest==7.2.2 pytest==7.2.2
# via pytest-cov # via
# pytest-cov
# pytest-mock
pytest-cov==4.0.0 pytest-cov==4.0.0
# via -r requirements/test.in # via -r requirements/test.in
pytest-mock==3.10.0
# via -r requirements/test.in
pyyaml==6.0 pyyaml==6.0
# via vcrpy # via vcrpy
requests==2.28.2 requests==2.28.2
# via label-studio-sdk # via label-studio-sdk
ruff==0.0.256 ruff==0.0.259
# via -r requirements/test.in # via -r requirements/test.in
six==1.16.0 six==1.16.0
# via vcrpy # via vcrpy
@ -91,7 +92,7 @@ tomli==2.0.1
# pytest # pytest
types-markdown==3.4.2.5 types-markdown==3.4.2.5
# via -r requirements/test.in # via -r requirements/test.in
types-requests==2.28.11.15 types-requests==2.28.11.16
# via -r requirements/test.in # via -r requirements/test.in
types-urllib3==1.26.25.8 types-urllib3==1.26.25.8
# via types-requests # via types-requests

View File

@ -4,6 +4,7 @@ from unittest.mock import patch
import pytest import pytest
import requests import requests
from requests.models import Response
from unstructured.documents.elements import PageBreak from unstructured.documents.elements import PageBreak
from unstructured.partition.html import partition_html from unstructured.partition.html import partition_html
@ -86,6 +87,25 @@ def test_partition_html_from_url_raises_with_bad_content_type():
partition_html(url="https://fake.url") partition_html(url="https://fake.url")
def test_partition_from_url_uses_headers(mocker):
    """Check that partition_html passes caller-supplied headers to requests.get."""
    url = "https://example.com"
    custom_headers = {"User-Agent": "test"}

    # Canned successful HTML response handed back by the patched requests.get.
    fake_response = Response()
    fake_response.status_code = 200
    fake_response.headers = {"Content-Type": "text/html"}
    fake_response._content = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )

    patched_get = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=url, headers=custom_headers)

    # The URL must be passed positionally and the headers by keyword, exactly once.
    patched_get.assert_called_once_with(url, headers=custom_headers)
def test_partition_html_raises_with_none_specified(): def test_partition_html_raises_with_none_specified():
with pytest.raises(ValueError): with pytest.raises(ValueError):
partition_html() partition_html()

View File

@ -1 +1 @@
__version__ = "0.5.7-dev2" # pragma: no cover __version__ = "0.5.7-dev3" # pragma: no cover

View File

@ -1,4 +1,4 @@
from typing import IO, List, Optional from typing import IO, Dict, List, Optional
import requests import requests
@ -20,6 +20,7 @@ def partition_html(
encoding: Optional[str] = None, encoding: Optional[str] = None,
include_page_breaks: bool = False, include_page_breaks: bool = False,
include_metadata: bool = True, include_metadata: bool = True,
headers: Dict[str, str] = {},
parser: VALID_PARSERS = None, parser: VALID_PARSERS = None,
) -> List[Element]: ) -> List[Element]:
"""Partitions an HTML document into its constituent elements. """Partitions an HTML document into its constituent elements.
@ -67,7 +68,7 @@ def partition_html(
document = HTMLDocument.from_string(_text, parser=parser) document = HTMLDocument.from_string(_text, parser=parser)
elif url is not None: elif url is not None:
response = requests.get(url) response = requests.get(url, headers=headers)
if not response.ok: if not response.ok:
raise ValueError(f"URL return an error: {response.status_code}") raise ValueError(f"URL return an error: {response.status_code}")