mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-11-02 19:13:13 +00:00
feat: add ability to pass headers in partition_html (#397)
Also adds pytest-mock requirement, those fixtures are nice to have! Implements issue/feature #396 .
This commit is contained in:
parent
a4394f6f16
commit
ce9fc26009
@ -1,8 +1,9 @@
|
||||
## 0.5.7-dev2
|
||||
## 0.5.7-dev3
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Refactored codebase using `exactly_one`
|
||||
* Adds ability to pass headers when passing a url in partition_html()
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ alabaster==0.7.13
|
||||
# via sphinx
|
||||
babel==2.12.1
|
||||
# via sphinx
|
||||
beautifulsoup4==4.11.2
|
||||
beautifulsoup4==4.12.0
|
||||
# via furo
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
@ -20,13 +20,13 @@ docutils==0.18.1
|
||||
# via
|
||||
# sphinx
|
||||
# sphinx-rtd-theme
|
||||
furo==2022.12.7
|
||||
furo==2023.3.23
|
||||
# via -r requirements/build.in
|
||||
idna==3.4
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
@ -52,6 +52,7 @@ sphinx==6.1.3
|
||||
# furo
|
||||
# sphinx-basic-ng
|
||||
# sphinx-rtd-theme
|
||||
# sphinxcontrib-jquery
|
||||
sphinx-basic-ng==1.0.0b1
|
||||
# via furo
|
||||
sphinx-rtd-theme==1.2.0rc3
|
||||
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.0.1
|
||||
# via sphinx
|
||||
sphinxcontrib-jquery==3.0.0
|
||||
sphinxcontrib-jquery==4.1
|
||||
# via sphinx-rtd-theme
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
# via sphinx
|
||||
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
zipp==3.15.0
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
|
||||
@ -210,10 +210,13 @@ Examples:
|
||||
|
||||
The ``partition_html`` function partitions an HTML document and returns a list
|
||||
of document ``Element`` objects. ``partition_html`` can take a filename, file-like
|
||||
object, or string as input. The three examples below all produce the same output.
|
||||
object, string, or url as input.
|
||||
|
||||
Examples:
|
||||
|
||||
These three invocations of partition_html() are essentially equivalent:
|
||||
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.html import partition_html
|
||||
@ -228,6 +231,22 @@ Examples:
|
||||
elements = partition_html(text=text)
|
||||
|
||||
|
||||
|
||||
The following illustrates fetching a url and partitioning the response content.
|
||||
|
||||
.. code:: python
|
||||
|
||||
from unstructured.partition.html import partition_html
|
||||
|
||||
elements = partition_html(url="https://python.org/")
|
||||
|
||||
# you can also provide custom headers:
|
||||
|
||||
elements = partition_html(url="https://python.org/",
|
||||
headers={"User-Agent": "YourScriptName/1.0 ..."})
|
||||
|
||||
|
||||
|
||||
``partition_pdf``
|
||||
---------------------
|
||||
|
||||
|
||||
@ -65,7 +65,7 @@ def get_forms_by_cik(session: requests.Session, cik: Union[str, int]) -> dict:
|
||||
response.raise_for_status()
|
||||
content = json.loads(response.content)
|
||||
recent_forms = content["filings"]["recent"]
|
||||
form_types = {k: v for k, v in zip(recent_forms["accessionNumber"], recent_forms["form"])}
|
||||
form_types = dict(zip(recent_forms["accessionNumber"], recent_forms["form"]))
|
||||
return form_types
|
||||
|
||||
|
||||
|
||||
@ -4,12 +4,9 @@
|
||||
#
|
||||
# pip-compile --output-file=requirements/base.txt
|
||||
#
|
||||
--extra-index-url https://pypi.ngc.nvidia.com
|
||||
--trusted-host pypi.ngc.nvidia.com
|
||||
|
||||
anyio==3.6.2
|
||||
# via httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via unstructured (setup.py)
|
||||
backoff==2.2.1
|
||||
# via argilla
|
||||
@ -40,7 +37,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via markdown
|
||||
joblib==1.2.0
|
||||
# via nltk
|
||||
@ -49,7 +46,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via unstructured (setup.py)
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
@ -59,7 +56,7 @@ numpy==1.23.5
|
||||
# via
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via unstructured (setup.py)
|
||||
packaging==23.0
|
||||
# via argilla
|
||||
@ -71,7 +68,7 @@ pillow==9.4.0
|
||||
# via
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via argilla
|
||||
pygments==2.14.0
|
||||
# via rich
|
||||
@ -87,7 +84,7 @@ python-pptx==0.6.21
|
||||
# via unstructured (setup.py)
|
||||
pytz==2022.7.1
|
||||
# via pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via nltk
|
||||
requests==2.28.2
|
||||
# via unstructured (setup.py)
|
||||
@ -110,7 +107,7 @@ typing-extensions==4.5.0
|
||||
# via
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
wrapt==1.14.1
|
||||
# via
|
||||
|
||||
@ -8,7 +8,7 @@ alabaster==0.7.13
|
||||
# via sphinx
|
||||
babel==2.12.1
|
||||
# via sphinx
|
||||
beautifulsoup4==4.11.2
|
||||
beautifulsoup4==4.12.0
|
||||
# via furo
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
@ -20,13 +20,13 @@ docutils==0.18.1
|
||||
# via
|
||||
# sphinx
|
||||
# sphinx-rtd-theme
|
||||
furo==2022.12.7
|
||||
furo==2023.3.23
|
||||
# via -r requirements/build.in
|
||||
idna==3.4
|
||||
# via requests
|
||||
imagesize==1.4.1
|
||||
# via sphinx
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via sphinx
|
||||
jinja2==3.1.2
|
||||
# via sphinx
|
||||
@ -52,6 +52,7 @@ sphinx==6.1.3
|
||||
# furo
|
||||
# sphinx-basic-ng
|
||||
# sphinx-rtd-theme
|
||||
# sphinxcontrib-jquery
|
||||
sphinx-basic-ng==1.0.0b1
|
||||
# via furo
|
||||
sphinx-rtd-theme==1.2.0rc3
|
||||
@ -62,7 +63,7 @@ sphinxcontrib-devhelp==1.0.2
|
||||
# via sphinx
|
||||
sphinxcontrib-htmlhelp==2.0.1
|
||||
# via sphinx
|
||||
sphinxcontrib-jquery==3.0.0
|
||||
sphinxcontrib-jquery==4.1
|
||||
# via sphinx-rtd-theme
|
||||
sphinxcontrib-jsmath==1.0.1
|
||||
# via sphinx
|
||||
@ -70,10 +71,7 @@ sphinxcontrib-qthelp==1.0.3
|
||||
# via sphinx
|
||||
sphinxcontrib-serializinghtml==1.1.5
|
||||
# via sphinx
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
zipp==3.15.0
|
||||
# via importlib-metadata
|
||||
|
||||
# The following packages are considered to be unsafe in a requirements file:
|
||||
# setuptools
|
||||
|
||||
@ -25,7 +25,7 @@ attrs==22.2.0
|
||||
# via jsonschema
|
||||
backcall==0.2.0
|
||||
# via ipython
|
||||
beautifulsoup4==4.11.2
|
||||
beautifulsoup4==4.12.0
|
||||
# via nbconvert
|
||||
bleach==6.0.0
|
||||
# via nbconvert
|
||||
@ -37,7 +37,7 @@ cfgv==3.3.1
|
||||
# via pre-commit
|
||||
click==8.1.3
|
||||
# via pip-tools
|
||||
comm==0.1.2
|
||||
comm==0.1.3
|
||||
# via ipykernel
|
||||
debugpy==1.6.6
|
||||
# via ipykernel
|
||||
@ -51,25 +51,24 @@ executing==1.2.0
|
||||
# via stack-data
|
||||
fastjsonschema==2.16.3
|
||||
# via nbformat
|
||||
filelock==3.9.0
|
||||
filelock==3.10.3
|
||||
# via virtualenv
|
||||
fqdn==1.5.1
|
||||
# via jsonschema
|
||||
identify==2.5.19
|
||||
identify==2.5.21
|
||||
# via pre-commit
|
||||
idna==3.4
|
||||
# via
|
||||
# anyio
|
||||
# jsonschema
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# jupyter-client
|
||||
# nbconvert
|
||||
importlib-resources==5.12.0
|
||||
# via jsonschema
|
||||
ipykernel==6.21.3
|
||||
ipykernel==6.22.0
|
||||
# via
|
||||
# ipywidgets
|
||||
# jupyter
|
||||
# jupyter-console
|
||||
# nbclassic
|
||||
@ -86,7 +85,7 @@ ipython-genutils==0.2.0
|
||||
# nbclassic
|
||||
# notebook
|
||||
# qtconsole
|
||||
ipywidgets==8.0.4
|
||||
ipywidgets==8.0.5
|
||||
# via jupyter
|
||||
isoduration==20.11.0
|
||||
# via jsonschema
|
||||
@ -106,7 +105,7 @@ jsonschema[format-nongpl]==4.17.3
|
||||
# nbformat
|
||||
jupyter==1.0.0
|
||||
# via -r requirements/dev.in
|
||||
jupyter-client==8.0.3
|
||||
jupyter-client==8.1.0
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-console
|
||||
@ -117,7 +116,7 @@ jupyter-client==8.0.3
|
||||
# qtconsole
|
||||
jupyter-console==6.6.3
|
||||
# via jupyter
|
||||
jupyter-core==5.2.0
|
||||
jupyter-core==5.3.0
|
||||
# via
|
||||
# -r requirements/dev.in
|
||||
# ipykernel
|
||||
@ -132,7 +131,7 @@ jupyter-core==5.2.0
|
||||
# qtconsole
|
||||
jupyter-events==0.6.3
|
||||
# via jupyter-server
|
||||
jupyter-server==2.4.0
|
||||
jupyter-server==2.5.0
|
||||
# via
|
||||
# nbclassic
|
||||
# notebook-shim
|
||||
@ -140,7 +139,7 @@ jupyter-server-terminals==0.4.4
|
||||
# via jupyter-server
|
||||
jupyterlab-pygments==0.2.2
|
||||
# via nbconvert
|
||||
jupyterlab-widgets==3.0.5
|
||||
jupyterlab-widgets==3.0.6
|
||||
# via ipywidgets
|
||||
markupsafe==2.1.2
|
||||
# via
|
||||
@ -156,13 +155,13 @@ nbclassic==0.5.3
|
||||
# via notebook
|
||||
nbclient==0.7.2
|
||||
# via nbconvert
|
||||
nbconvert==7.2.9
|
||||
nbconvert==7.2.10
|
||||
# via
|
||||
# jupyter
|
||||
# jupyter-server
|
||||
# nbclassic
|
||||
# notebook
|
||||
nbformat==5.7.3
|
||||
nbformat==5.8.0
|
||||
# via
|
||||
# jupyter-server
|
||||
# nbclassic
|
||||
@ -186,6 +185,7 @@ packaging==23.0
|
||||
# ipykernel
|
||||
# jupyter-server
|
||||
# nbconvert
|
||||
# qtconsole
|
||||
# qtpy
|
||||
pandocfilters==1.5.0
|
||||
# via nbconvert
|
||||
@ -203,7 +203,7 @@ platformdirs==3.1.1
|
||||
# via
|
||||
# jupyter-core
|
||||
# virtualenv
|
||||
pre-commit==3.1.1
|
||||
pre-commit==3.2.0
|
||||
# via -r requirements/dev.in
|
||||
prometheus-client==0.16.0
|
||||
# via
|
||||
@ -244,7 +244,7 @@ pyyaml==6.0
|
||||
# via
|
||||
# jupyter-events
|
||||
# pre-commit
|
||||
pyzmq==25.0.0
|
||||
pyzmq==25.0.2
|
||||
# via
|
||||
# ipykernel
|
||||
# jupyter-client
|
||||
@ -253,7 +253,7 @@ pyzmq==25.0.0
|
||||
# nbclassic
|
||||
# notebook
|
||||
# qtconsole
|
||||
qtconsole==5.4.0
|
||||
qtconsole==5.4.1
|
||||
# via jupyter
|
||||
qtpy==2.3.0
|
||||
# via qtconsole
|
||||
@ -322,7 +322,7 @@ traitlets==5.9.0
|
||||
# qtconsole
|
||||
uri-template==1.2.0
|
||||
# via jsonschema
|
||||
virtualenv==20.20.0
|
||||
virtualenv==20.21.0
|
||||
# via pre-commit
|
||||
wcwidth==0.2.6
|
||||
# via prompt-toolkit
|
||||
@ -334,11 +334,11 @@ webencodings==0.5.1
|
||||
# tinycss2
|
||||
websocket-client==1.5.1
|
||||
# via jupyter-server
|
||||
wheel==0.38.4
|
||||
wheel==0.40.0
|
||||
# via
|
||||
# -r requirements/dev.in
|
||||
# pip-tools
|
||||
widgetsnbextension==4.0.5
|
||||
widgetsnbextension==4.0.6
|
||||
# via ipywidgets
|
||||
zipp==3.15.0
|
||||
# via
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#
|
||||
anyio==3.6.2
|
||||
# via httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via unstructured (setup.py)
|
||||
backoff==2.2.1
|
||||
# via argilla
|
||||
@ -28,9 +28,10 @@ deprecated==1.2.13
|
||||
# via argilla
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
filelock==3.9.0
|
||||
filelock==3.10.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
# transformers
|
||||
h11==0.14.0
|
||||
# via httpcore
|
||||
@ -38,15 +39,17 @@ httpcore==0.16.3
|
||||
# via httpx
|
||||
httpx==0.23.3
|
||||
# via argilla
|
||||
huggingface-hub==0.13.1
|
||||
huggingface-hub==0.13.3
|
||||
# via transformers
|
||||
idna==3.4
|
||||
# via
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via markdown
|
||||
jinja2==3.1.2
|
||||
# via torch
|
||||
joblib==1.2.0
|
||||
# via
|
||||
# nltk
|
||||
@ -58,10 +61,16 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via unstructured (setup.py)
|
||||
markupsafe==2.1.2
|
||||
# via jinja2
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
networkx==3.0
|
||||
# via torch
|
||||
nltk==3.8.1
|
||||
# via unstructured (setup.py)
|
||||
numpy==1.23.5
|
||||
@ -69,7 +78,7 @@ numpy==1.23.5
|
||||
# argilla
|
||||
# pandas
|
||||
# transformers
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via unstructured (setup.py)
|
||||
packaging==23.0
|
||||
# via
|
||||
@ -84,10 +93,12 @@ pillow==9.4.0
|
||||
# via
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via argilla
|
||||
pygments==2.14.0
|
||||
# via rich
|
||||
pypandoc==1.11
|
||||
# via unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via pandas
|
||||
python-docx==0.8.11
|
||||
@ -102,7 +113,7 @@ pyyaml==6.0
|
||||
# via
|
||||
# huggingface-hub
|
||||
# transformers
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# nltk
|
||||
# sacremoses
|
||||
@ -130,9 +141,11 @@ sniffio==1.3.0
|
||||
# anyio
|
||||
# httpcore
|
||||
# httpx
|
||||
sympy==1.11.1
|
||||
# via torch
|
||||
tokenizers==0.13.2
|
||||
# via transformers
|
||||
torch==1.13.1
|
||||
torch==2.0.0
|
||||
# via unstructured (setup.py)
|
||||
tqdm==4.65.0
|
||||
# via
|
||||
@ -141,7 +154,7 @@ tqdm==4.65.0
|
||||
# nltk
|
||||
# sacremoses
|
||||
# transformers
|
||||
transformers==4.26.1
|
||||
transformers==4.27.3
|
||||
# via unstructured (setup.py)
|
||||
typing-extensions==4.5.0
|
||||
# via
|
||||
@ -149,7 +162,7 @@ typing-extensions==4.5.0
|
||||
# pydantic
|
||||
# rich
|
||||
# torch
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
wrapt==1.14.1
|
||||
# via
|
||||
|
||||
@ -16,7 +16,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.3.1
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -50,7 +50,7 @@ cffi==1.15.1
|
||||
# via
|
||||
# azure-datalake-store
|
||||
# cryptography
|
||||
charset-normalizer==3.0.1
|
||||
charset-normalizer==3.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# aiohttp
|
||||
@ -59,7 +59,11 @@ click==8.1.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
cryptography==39.0.1
|
||||
commonmark==0.9.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
cryptography==39.0.2
|
||||
# via
|
||||
# adal
|
||||
# azure-identity
|
||||
@ -78,7 +82,7 @@ frozenlist==1.3.3
|
||||
# via
|
||||
# aiohttp
|
||||
# aiosignal
|
||||
fsspec==2023.1.0
|
||||
fsspec==2023.3.0
|
||||
# via
|
||||
# adlfs
|
||||
# unstructured (setup.py)
|
||||
@ -101,7 +105,7 @@ idna==3.4
|
||||
# requests
|
||||
# rfc3986
|
||||
# yarl
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -117,7 +121,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -144,7 +148,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -166,14 +170,22 @@ portalocker==2.7.0
|
||||
# via msal-extensions
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydantic==1.10.5
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pyjwt[crypto]==2.6.0
|
||||
# via
|
||||
# adal
|
||||
# msal
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -195,7 +207,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -211,6 +223,10 @@ rfc3986[idna2008]==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
rich==13.0.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
six==1.16.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -224,7 +240,7 @@ sniffio==1.3.0
|
||||
# anyio
|
||||
# httpcore
|
||||
# httpx
|
||||
tqdm==4.64.1
|
||||
tqdm==4.65.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -235,7 +251,8 @@ typing-extensions==4.5.0
|
||||
# azure-core
|
||||
# azure-storage-blob
|
||||
# pydantic
|
||||
urllib3==1.26.14
|
||||
# rich
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
@ -244,7 +261,7 @@ wrapt==1.14.1
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# deprecated
|
||||
xlsxwriter==3.0.8
|
||||
xlsxwriter==3.0.9
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -64,7 +64,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -78,7 +78,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -95,7 +95,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -115,7 +115,7 @@ pillow==9.4.0
|
||||
# unstructured (setup.py)
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -129,6 +129,10 @@ pyjwt==2.6.0
|
||||
# via pygithub
|
||||
pynacl==1.5.0
|
||||
# via pygithub
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -149,7 +153,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -186,7 +190,7 @@ typing-extensions==4.5.0
|
||||
# -r requirements/base.txt
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -61,7 +61,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -75,7 +75,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -92,7 +92,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -110,7 +110,7 @@ pillow==9.4.0
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -118,6 +118,10 @@ pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -140,7 +144,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -180,7 +184,7 @@ typing-extensions==4.5.0
|
||||
# -r requirements/base.txt
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -47,7 +47,7 @@ et-xmlfile==1.1.0
|
||||
# openpyxl
|
||||
google-api-core==2.11.0
|
||||
# via google-api-python-client
|
||||
google-api-python-client==2.80.0
|
||||
google-api-python-client==2.82.0
|
||||
# via unstructured (setup.py)
|
||||
google-auth==2.16.2
|
||||
# via
|
||||
@ -56,7 +56,7 @@ google-auth==2.16.2
|
||||
# google-auth-httplib2
|
||||
google-auth-httplib2==0.1.0
|
||||
# via google-api-python-client
|
||||
googleapis-common-protos==1.58.0
|
||||
googleapis-common-protos==1.59.0
|
||||
# via google-api-core
|
||||
h11==0.14.0
|
||||
# via
|
||||
@ -66,7 +66,7 @@ httpcore==0.16.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpx
|
||||
httplib2==0.21.0
|
||||
httplib2==0.22.0
|
||||
# via
|
||||
# google-api-python-client
|
||||
# google-auth-httplib2
|
||||
@ -80,7 +80,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -94,7 +94,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -111,7 +111,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -139,7 +139,7 @@ pyasn1==0.4.8
|
||||
# rsa
|
||||
pyasn1-modules==0.2.8
|
||||
# via google-auth
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -147,6 +147,10 @@ pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
pyparsing==3.0.9
|
||||
# via httplib2
|
||||
python-dateutil==2.8.2
|
||||
@ -169,7 +173,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -212,7 +216,7 @@ typing-extensions==4.5.0
|
||||
# rich
|
||||
uritemplate==4.1.1
|
||||
# via google-api-python-client
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -61,7 +61,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -75,7 +75,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -92,7 +92,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -114,7 +114,7 @@ praw==7.7.0
|
||||
# via unstructured (setup.py)
|
||||
prawcore==2.3.0
|
||||
# via praw
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -122,6 +122,10 @@ pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -142,7 +146,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -182,7 +186,7 @@ typing-extensions==4.5.0
|
||||
# rich
|
||||
update-checker==0.18.0
|
||||
# via praw
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
|
||||
@ -18,7 +18,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -87,7 +87,7 @@ idna==3.4
|
||||
# requests
|
||||
# rfc3986
|
||||
# yarl
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -103,7 +103,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -124,7 +124,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -142,7 +142,7 @@ pillow==9.4.0
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -150,6 +150,10 @@ pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -171,7 +175,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -210,7 +214,7 @@ typing-extensions==4.5.0
|
||||
# aioitertools
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# botocore
|
||||
|
||||
@ -8,7 +8,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# httpcore
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -16,7 +16,7 @@ backoff==2.2.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
beautifulsoup4==4.11.2
|
||||
beautifulsoup4==4.12.0
|
||||
# via wikipedia
|
||||
certifi==2022.12.7
|
||||
# via
|
||||
@ -63,7 +63,7 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# markdown
|
||||
@ -77,7 +77,7 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -94,7 +94,7 @@ numpy==1.23.5
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
# pandas
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
@ -112,7 +112,7 @@ pillow==9.4.0
|
||||
# -r requirements/base.txt
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# argilla
|
||||
@ -120,6 +120,10 @@ pygments==2.14.0
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# rich
|
||||
pypandoc==1.11
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# unstructured (setup.py)
|
||||
python-dateutil==2.8.2
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
@ -140,7 +144,7 @@ pytz==2022.7.1
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# pandas
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# nltk
|
||||
@ -179,7 +183,7 @@ typing-extensions==4.5.0
|
||||
# -r requirements/base.txt
|
||||
# pydantic
|
||||
# rich
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via
|
||||
# -r requirements/base.txt
|
||||
# requests
|
||||
|
||||
@ -10,7 +10,7 @@ anyio==3.6.2
|
||||
# via
|
||||
# httpcore
|
||||
# starlette
|
||||
argilla==1.4.0
|
||||
argilla==1.5.0
|
||||
# via unstructured (setup.py)
|
||||
backoff==2.2.1
|
||||
# via argilla
|
||||
@ -46,15 +46,16 @@ effdet==0.3.0
|
||||
# via layoutparser
|
||||
et-xmlfile==1.1.0
|
||||
# via openpyxl
|
||||
fastapi==0.94.0
|
||||
fastapi==0.95.0
|
||||
# via unstructured-inference
|
||||
filelock==3.9.0
|
||||
filelock==3.10.3
|
||||
# via
|
||||
# huggingface-hub
|
||||
# torch
|
||||
# transformers
|
||||
flatbuffers==23.3.3
|
||||
# via onnxruntime
|
||||
fonttools==4.39.0
|
||||
fonttools==4.39.2
|
||||
# via matplotlib
|
||||
h11==0.14.0
|
||||
# via
|
||||
@ -64,7 +65,7 @@ httpcore==0.16.3
|
||||
# via httpx
|
||||
httpx==0.23.3
|
||||
# via argilla
|
||||
huggingface-hub==0.13.1
|
||||
huggingface-hub==0.13.3
|
||||
# via
|
||||
# timm
|
||||
# transformers
|
||||
@ -76,12 +77,14 @@ idna==3.4
|
||||
# anyio
|
||||
# requests
|
||||
# rfc3986
|
||||
importlib-metadata==6.0.0
|
||||
importlib-metadata==6.1.0
|
||||
# via markdown
|
||||
importlib-resources==5.12.0
|
||||
# via matplotlib
|
||||
iopath==0.1.10
|
||||
# via layoutparser
|
||||
jinja2==3.1.2
|
||||
# via torch
|
||||
joblib==1.2.0
|
||||
# via nltk
|
||||
kiwisolver==1.4.4
|
||||
@ -93,14 +96,18 @@ lxml==4.9.2
|
||||
# python-docx
|
||||
# python-pptx
|
||||
# unstructured (setup.py)
|
||||
markdown==3.4.1
|
||||
markdown==3.4.3
|
||||
# via unstructured (setup.py)
|
||||
markupsafe==2.1.2
|
||||
# via jinja2
|
||||
matplotlib==3.7.1
|
||||
# via pycocotools
|
||||
monotonic==1.6
|
||||
# via argilla
|
||||
mpmath==1.3.0
|
||||
# via sympy
|
||||
networkx==3.0
|
||||
# via torch
|
||||
nltk==3.8.1
|
||||
# via unstructured (setup.py)
|
||||
numpy==1.23.5
|
||||
@ -124,7 +131,7 @@ opencv-python==4.6.0.66
|
||||
# via
|
||||
# layoutparser
|
||||
# unstructured-inference
|
||||
openpyxl==3.1.1
|
||||
openpyxl==3.1.2
|
||||
# via unstructured (setup.py)
|
||||
packaging==23.0
|
||||
# via
|
||||
@ -163,12 +170,14 @@ pycocotools==2.0.6
|
||||
# via effdet
|
||||
pycparser==2.21
|
||||
# via cffi
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via
|
||||
# argilla
|
||||
# fastapi
|
||||
pygments==2.14.0
|
||||
# via rich
|
||||
pypandoc==1.11
|
||||
# via unstructured (setup.py)
|
||||
pyparsing==3.0.9
|
||||
# via matplotlib
|
||||
pytesseract==0.3.10
|
||||
@ -194,7 +203,7 @@ pyyaml==6.0
|
||||
# omegaconf
|
||||
# timm
|
||||
# transformers
|
||||
regex==2022.10.31
|
||||
regex==2023.3.23
|
||||
# via
|
||||
# nltk
|
||||
# transformers
|
||||
@ -217,21 +226,23 @@ sniffio==1.3.0
|
||||
# anyio
|
||||
# httpcore
|
||||
# httpx
|
||||
starlette==0.26.0.post1
|
||||
starlette==0.26.1
|
||||
# via fastapi
|
||||
sympy==1.11.1
|
||||
# via onnxruntime
|
||||
# via
|
||||
# onnxruntime
|
||||
# torch
|
||||
timm==0.6.12
|
||||
# via effdet
|
||||
tokenizers==0.13.2
|
||||
# via transformers
|
||||
torch==1.13.1
|
||||
torch==2.0.0
|
||||
# via
|
||||
# effdet
|
||||
# layoutparser
|
||||
# timm
|
||||
# torchvision
|
||||
torchvision==0.14.1
|
||||
torchvision==0.15.1
|
||||
# via
|
||||
# effdet
|
||||
# layoutparser
|
||||
@ -243,7 +254,7 @@ tqdm==4.65.0
|
||||
# iopath
|
||||
# nltk
|
||||
# transformers
|
||||
transformers==4.26.1
|
||||
transformers==4.27.3
|
||||
# via unstructured-inference
|
||||
typing-extensions==4.5.0
|
||||
# via
|
||||
@ -253,12 +264,11 @@ typing-extensions==4.5.0
|
||||
# rich
|
||||
# starlette
|
||||
# torch
|
||||
# torchvision
|
||||
unstructured-inference==0.2.11
|
||||
# via unstructured (setup.py)
|
||||
urllib3==1.26.14
|
||||
urllib3==1.26.15
|
||||
# via requests
|
||||
uvicorn==0.21.0
|
||||
uvicorn==0.21.1
|
||||
# via unstructured-inference
|
||||
wand==0.6.11
|
||||
# via pdfplumber
|
||||
|
||||
@ -8,6 +8,7 @@ flake8
|
||||
mypy
|
||||
types-Markdown
|
||||
pytest-cov
|
||||
pytest-mock
|
||||
label_studio_sdk
|
||||
types-requests
|
||||
vcrpy
|
||||
|
||||
@ -4,9 +4,6 @@
|
||||
#
|
||||
# pip-compile requirements/test.in
|
||||
#
|
||||
--extra-index-url https://pypi.ngc.nvidia.com
|
||||
--trusted-host pypi.ngc.nvidia.com
|
||||
|
||||
appdirs==1.4.4
|
||||
# via label-studio-tools
|
||||
attrs==22.2.0
|
||||
@ -23,7 +20,7 @@ click==8.1.3
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# black
|
||||
coverage[toml]==7.2.1
|
||||
coverage[toml]==7.2.2
|
||||
# via
|
||||
# -r requirements/test.in
|
||||
# pytest-cov
|
||||
@ -67,19 +64,23 @@ pluggy==1.0.0
|
||||
# via pytest
|
||||
pycodestyle==2.10.0
|
||||
# via flake8
|
||||
pydantic==1.10.6
|
||||
pydantic==1.10.7
|
||||
# via label-studio-sdk
|
||||
pyflakes==3.0.1
|
||||
# via flake8
|
||||
pytest==7.2.2
|
||||
# via pytest-cov
|
||||
# via
|
||||
# pytest-cov
|
||||
# pytest-mock
|
||||
pytest-cov==4.0.0
|
||||
# via -r requirements/test.in
|
||||
pytest-mock==3.10.0
|
||||
# via -r requirements/test.in
|
||||
pyyaml==6.0
|
||||
# via vcrpy
|
||||
requests==2.28.2
|
||||
# via label-studio-sdk
|
||||
ruff==0.0.256
|
||||
ruff==0.0.259
|
||||
# via -r requirements/test.in
|
||||
six==1.16.0
|
||||
# via vcrpy
|
||||
@ -91,7 +92,7 @@ tomli==2.0.1
|
||||
# pytest
|
||||
types-markdown==3.4.2.5
|
||||
# via -r requirements/test.in
|
||||
types-requests==2.28.11.15
|
||||
types-requests==2.28.11.16
|
||||
# via -r requirements/test.in
|
||||
types-urllib3==1.26.25.8
|
||||
# via types-requests
|
||||
|
||||
@ -4,6 +4,7 @@ from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
from requests.models import Response
|
||||
|
||||
from unstructured.documents.elements import PageBreak
|
||||
from unstructured.partition.html import partition_html
|
||||
@ -86,6 +87,25 @@ def test_partition_html_from_url_raises_with_bad_content_type():
|
||||
partition_html(url="https://fake.url")
|
||||
|
||||
|
||||
def test_partition_from_url_uses_headers(mocker):
    """Check that partition_html passes caller-supplied headers to requests.get."""
    url = "https://example.com"
    request_headers = {"User-Agent": "test"}
    html_payload = (
        b"<html><head></head><body><p>What do i know? Who needs to know it?</p></body></html>"
    )

    # Hand-build a successful HTML response to serve as the stubbed return value.
    fake_response = Response()
    fake_response.headers = {"Content-Type": "text/html"}
    fake_response._content = html_payload
    fake_response.status_code = 200

    patched_get = mocker.patch("requests.get", return_value=fake_response)

    partition_html(url=url, headers=request_headers)

    # The stub must have been hit exactly once: URL positionally, headers by keyword.
    patched_get.assert_called_once_with(url, headers=request_headers)
|
||||
|
||||
|
||||
def test_partition_html_raises_with_none_specified():
|
||||
with pytest.raises(ValueError):
|
||||
partition_html()
|
||||
|
||||
@ -1 +1 @@
|
||||
__version__ = "0.5.7-dev2" # pragma: no cover
|
||||
__version__ = "0.5.7-dev3" # pragma: no cover
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
from typing import IO, List, Optional
|
||||
from typing import IO, Dict, List, Optional
|
||||
|
||||
import requests
|
||||
|
||||
@ -20,6 +20,7 @@ def partition_html(
|
||||
encoding: Optional[str] = None,
|
||||
include_page_breaks: bool = False,
|
||||
include_metadata: bool = True,
|
||||
headers: Dict[str, str] = {},
|
||||
parser: VALID_PARSERS = None,
|
||||
) -> List[Element]:
|
||||
"""Partitions an HTML document into its constituent elements.
|
||||
@ -67,7 +68,7 @@ def partition_html(
|
||||
document = HTMLDocument.from_string(_text, parser=parser)
|
||||
|
||||
elif url is not None:
|
||||
response = requests.get(url)
|
||||
response = requests.get(url, headers=headers)
|
||||
if not response.ok:
|
||||
raise ValueError(f"URL return an error: {response.status_code}")
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user