Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-10-29 17:04:37 +00:00)
build: remove ruff version upper bound (#3829)
**Summary:** Remove the version pin on the `ruff` linter and fix the handful of lint errors the newer version catches.
parent b092fb7f47
commit 10f0d54ac2
@@ -1,4 +1,4 @@
-## 0.16.12-dev1
+## 0.16.12-dev2

 ### Enhancements

@@ -8,6 +8,8 @@

 ### Fixes

+- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
+
 ## 0.16.11

 ### Enhancements

@@ -12,9 +12,17 @@ verboseOutput = true

 [tool.ruff]
 line-length = 100
+target-version = "py39"

-# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` --
-lint.select = [
+[tool.ruff.lint]
+ignore = [
+    "COM812",  # -- over aggressively insists on trailing commas where not desireable --
+    "PT001",   # -- wants empty parens on @pytest.fixture where not used (essentially always) --
+    "PT011",   # -- pytest.raises({exc}) too broad, use match param or more specific exception --
+    "PT012",   # -- pytest.raises() block should contain a single simple statement --
+    "SIM117",  # -- merge `with` statements for context managers that have same scope --
+]
+select = [
     "C4",      # -- flake8-comprehensions --
     "COM",     # -- flake8-commas --
     "E",       # -- pycodestyle errors --
@@ -29,11 +37,3 @@ lint.select = [
     "UP034",   # -- Avoid extraneous parentheses --
     "W",       # -- Warnings, including invalid escape-sequence --
 ]
-lint.ignore = [
-    "COM812",  # -- over aggressively insists on trailing commas where not desireable --
-    "PT001",   # -- wants empty parens on @pytest.fixture where not used (essentially always) --
-    "PT005",   # -- flags mock fixtures with names intentionally matching private method name --
-    "PT011",   # -- pytest.raises({exc}) too broad, use match param or more specific exception --
-    "PT012",   # -- pytest.raises() block should contain a single simple statement --
-    "SIM117",  # -- merge `with` statements for context managers that have same scope --
-]

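For context on one of the ignored rules carried over here: PT011 wants `pytest.raises({exc})` narrowed with a `match` argument or a more specific exception type, as the comment above notes. A minimal sketch of the form the rule asks for (the helper and test names are illustrative, not from this codebase):

```python
import pytest


def parse_positive(value: str) -> int:
    """Illustrative helper that raises on non-positive input."""
    n = int(value)
    if n <= 0:
        raise ValueError(f"expected a positive integer, got {n!r}")
    return n


def test_parse_positive_rejects_zero():
    # PT011-compliant: a specific exception type plus a `match` pattern,
    # rather than a bare `pytest.raises(ValueError)`.
    with pytest.raises(ValueError, match="expected a positive integer"):
        parse_positive("0")
```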
@@ -11,9 +11,7 @@ mypy
 pydantic
 pytest-cov
 pytest-mock
-# NOTE(robison) - we need to do additional cleanup to pass
-# linting for the latest version of ruff
-ruff<0.5.0
+ruff
 types-Markdown
 types-requests
 types-tabulate

@@ -171,7 +171,7 @@ requests==2.32.3
     # requests-mock
 requests-mock==1.12.1
     # via label-studio-sdk
-ruff==0.4.10
+ruff==0.8.3
     # via -r ./test.in
 semantic-version==2.10.0
     # via liccheck

@@ -245,12 +245,14 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
         _test(result)
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            result = pdf.partition_pdf(
-                file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                result = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=strategy,
+                    starting_page_number=starting_page_number,
+                )
         _test(result)

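This hunk, and the similar test hunks below, all make the same fix: a bare `SpooledTemporaryFile()` assignment becomes a `with` block, the pattern a newer ruff enforces (most likely its use-a-context-manager check, SIM115, which in recent versions also covers `tempfile` objects). A minimal sketch of the pattern, where `partition_from_spooled_copy` and `partition_fn` are hypothetical stand-ins for the partitioners these tests exercise:

```python
from tempfile import SpooledTemporaryFile


def partition_from_spooled_copy(path: str, partition_fn):
    """Copy a file into a spooled temp file, then hand it to a partitioner.

    Illustrative only: `partition_fn` stands in for `pdf.partition_pdf`,
    `partition_docx`, etc. in the tests in this commit.
    """
    with open(path, "rb") as src:
        # The context manager guarantees the spooled file is closed on every
        # exit path, which the old bare assignment did not.
        with SpooledTemporaryFile() as spooled:
            spooled.write(src.read())
            spooled.seek(0)  # rewind so the partitioner reads from the start
            return partition_fn(file=spooled)
```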
@@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date(
         )
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            elements = pdf.partition_pdf(
-                file=spooled_temp_file,
-                strategy=strategy,
-                metadata_last_modified=metadata_last_modified,
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                elements = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=strategy,
+                    metadata_last_modified=metadata_last_modified,
+                )

     assert {el.metadata.last_modified for el in elements} == {expected_last_modified}

@@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy(
         )
     else:
         with open(filename, "rb") as test_file:
-            spooled_temp_file = SpooledTemporaryFile()
-            spooled_temp_file.write(test_file.read())
-            spooled_temp_file.seek(0)
-            elements = pdf.partition_pdf(
-                file=spooled_temp_file,
-                strategy=PartitionStrategy.OCR_ONLY,
-                languages=["eng"],
-                is_image=is_image,
-            )
+            with SpooledTemporaryFile() as spooled_temp_file:
+                spooled_temp_file.write(test_file.read())
+                spooled_temp_file.seek(0)
+                elements = pdf.partition_pdf(
+                    file=spooled_temp_file,
+                    strategy=PartitionStrategy.OCR_ONLY,
+                    languages=["eng"],
+                    is_image=is_image,
+                )

     assert elements[0].metadata.languages == ["eng"]
     # check pages

@@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file(
     `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
     to ensure the source file is appropriately converted in this case.
     """
-    with open(mock_document_file_path, "rb") as test_file:
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(test_file.read())
+    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+        with open(mock_document_file_path, "rb") as test_file:
+            spooled_temp_file.write(test_file.read())
         spooled_temp_file.seek(0)

         elements = partition_docx(file=spooled_temp_file)

-    assert elements == expected_elements
-    for element in elements:
-        assert element.metadata.filename is None
+        assert elements == expected_elements
+        assert all(e.metadata.filename is None for e in elements)


 def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]):

@@ -921,16 +922,16 @@ class DescribeDocxPartitionerOptions:
     def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
         self, opts_args: dict[str, Any]
     ):
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(b"abcdefg")
-        opts_args["file"] = spooled_temp_file
-        opts = DocxPartitionerOptions(**opts_args)
-
-        docx_file = opts._docx_file
-
-        assert docx_file is not spooled_temp_file
-        assert isinstance(docx_file, io.BytesIO)
-        assert docx_file.getvalue() == b"abcdefg"
+        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(b"abcdefg")
+            opts_args["file"] = spooled_temp_file
+            opts = DocxPartitionerOptions(**opts_args)
+
+            docx_file = opts._docx_file
+
+            assert docx_file is not spooled_temp_file
+            assert isinstance(docx_file, io.BytesIO)
+            assert docx_file.getvalue() == b"abcdefg"

     def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
         self, opts_args: dict[str, Any]

@@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():

     Including one that does not have its read-pointer set to the start.
     """
-    with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(test_file.read())
+    with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+        with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
+            spooled_temp_file.write(test_file.read())

         elements = partition_pptx(file=spooled_temp_file)

     assert elements == EXPECTED_PPTX_OUTPUT
     for element in elements:
         assert element.metadata.filename is None

@@ -701,16 +703,16 @@ class DescribePptxPartitionerOptions:
     def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
         self, opts_args: dict[str, Any]
     ):
-        spooled_temp_file = tempfile.SpooledTemporaryFile()
-        spooled_temp_file.write(b"abcdefg")
-        opts_args["file"] = spooled_temp_file
-        opts = PptxPartitionerOptions(**opts_args)
-
-        pptx_file = opts.pptx_file
-
-        assert pptx_file is not spooled_temp_file
-        assert isinstance(pptx_file, io.BytesIO)
-        assert pptx_file.getvalue() == b"abcdefg"
+        with tempfile.SpooledTemporaryFile() as spooled_temp_file:
+            spooled_temp_file.write(b"abcdefg")
+            opts_args["file"] = spooled_temp_file
+            opts = PptxPartitionerOptions(**opts_args)
+
+            pptx_file = opts.pptx_file
+
+            assert pptx_file is not spooled_temp_file
+            assert isinstance(pptx_file, io.BytesIO)
+            assert pptx_file.getvalue() == b"abcdefg"

     def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
         self, opts_args: dict[str, Any]

@@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():


 def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
-    f = tempfile.SpooledTemporaryFile()
-    with open("example-docs/emoji.xlsx", "rb") as g:
-        f.write(g.read())
-    elements = partition_xlsx(file=f, include_header=False)
+    with tempfile.SpooledTemporaryFile() as f:
+        with open("example-docs/emoji.xlsx", "rb") as g:
+            f.write(g.read())
+
+        elements = partition_xlsx(file=f, include_header=False)

     assert sum(isinstance(element, Text) for element in elements) == 1
     assert len(elements) == 1
     assert clean_extra_whitespace(elements[0].text) == "🤠😅"

@@ -1 +1 @@
-__version__ = "0.16.12-dev1"  # pragma: no cover
+__version__ = "0.16.12-dev2"  # pragma: no cover

@@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
     """
     elements_to_return = []
     if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
-
         if page_number is None and isinstance(ontology_element, ontology.Page):
             page_number = ontology_element.page_number

@@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
     if any(isinstance(ontology_element, class_) for class_ in text_classes):
         return True

-    if any(ontology_element.elementType == category for category in text_categories):
-        return True
-
-    return False
+    return any(ontology_element.elementType == category for category in text_categories)


 def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:

@@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
     if any(isinstance(ontology_element, class_) for class_ in inline_classes):
         return True

-    if any(ontology_element.elementType == category for category in inline_categories):
-        return True
-
-    return False
+    return any(ontology_element.elementType == category for category in inline_categories)


 def unstructured_elements_to_ontology(

@@ -327,10 +320,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
         if tag.attrs:
             return False

-        if not tag.get_text(strip=True):
-            return True
-
-        return False
+        return bool(not tag.get_text(strip=True))

     def remove_empty_tags(soup):
         for tag in soup.find_all():

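The three hunks above share one simplification: an `if <condition>: return True / ... / return False` ladder becomes a single `return` of the condition, the rewrite that flake8-simplify's SIM103 (needless-bool) rule suggests and that the newer ruff appears to flag here. A minimal sketch with illustrative names:

```python
def has_matching_category(element_type: str, categories: list) -> bool:
    """Illustrative only: mirrors the is_text_element/is_inline_element fix."""
    # Before (flagged):
    #     if any(element_type == category for category in categories):
    #         return True
    #     return False
    # After: the predicate already evaluates to the bool we want.
    return any(element_type == category for category in categories)
```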
@@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(

     # Scenario 1: Valid Ontology Element
     if soup.attrs.get("class"):
-        html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get(
-            (soup.name, soup.attrs["class"][0])
-        )
+        html_tag, element_class = (
+            soup.name,
+            HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
+        )

     # Scenario 2: HTML tag incorrect, CSS class correct

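The last hunk is purely presentational: a two-target assignment whose second value wrapped mid-call becomes an explicitly parenthesized tuple, one element per line with a trailing comma, so each target lines up with its value. A sketch of the same layout under hypothetical names (the mapping and function here are not from the codebase):

```python
from typing import Optional, Tuple

# Hypothetical stand-in for HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.
TAG_AND_CLASS_TO_TYPE = {("p", "Paragraph"): "NarrativeText"}


def classify(tag_name: str, css_class: str) -> Tuple[str, Optional[str]]:
    # Parenthesizing the tuple keeps the second value out of a wrapped call
    # and makes the pairing of targets and values explicit.
    html_tag, element_type = (
        tag_name,
        TAG_AND_CLASS_TO_TYPE.get((tag_name, css_class)),
    )
    return html_tag, element_type
```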