build: remove ruff version upper bound (#3829)

**Summary**
Remove pin on `ruff` linter and fix the handful of lint errors a newer
version catches.
This commit is contained in:
Steve Canny 2024-12-16 15:01:22 -08:00 committed by GitHub
parent b092fb7f47
commit 10f0d54ac2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 81 additions and 83 deletions

View File

@ -1,4 +1,4 @@
## 0.16.12-dev1 ## 0.16.12-dev2
### Enhancements ### Enhancements
@ -8,6 +8,8 @@
### Fixes ### Fixes
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
## 0.16.11 ## 0.16.11
### Enhancements ### Enhancements

View File

@ -12,9 +12,17 @@ verboseOutput = true
[tool.ruff] [tool.ruff]
line-length = 100 line-length = 100
target-version = "py39"
# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` -- [tool.ruff.lint]
lint.select = [ ignore = [
"COM812", # -- over-aggressively insists on trailing commas where not desirable --
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
"PT012", # -- pytest.raises() block should contain a single simple statement --
"SIM117", # -- merge `with` statements for context managers that have same scope --
]
select = [
"C4", # -- flake8-comprehensions -- "C4", # -- flake8-comprehensions --
"COM", # -- flake8-commas -- "COM", # -- flake8-commas --
"E", # -- pycodestyle errors -- "E", # -- pycodestyle errors --
@ -29,11 +37,3 @@ lint.select = [
"UP034", # -- Avoid extraneous parentheses -- "UP034", # -- Avoid extraneous parentheses --
"W", # -- Warnings, including invalid escape-sequence -- "W", # -- Warnings, including invalid escape-sequence --
] ]
lint.ignore = [
"COM812", # -- over-aggressively insists on trailing commas where not desirable --
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
"PT005", # -- flags mock fixtures with names intentionally matching private method name --
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
"PT012", # -- pytest.raises() block should contain a single simple statement --
"SIM117", # -- merge `with` statements for context managers that have same scope --
]

View File

@ -11,9 +11,7 @@ mypy
pydantic pydantic
pytest-cov pytest-cov
pytest-mock pytest-mock
# NOTE(robison) - we need to do additional cleanup to pass ruff
# linting for the latest version of ruff
ruff<0.5.0
types-Markdown types-Markdown
types-requests types-requests
types-tabulate types-tabulate

View File

@ -171,7 +171,7 @@ requests==2.32.3
# requests-mock # requests-mock
requests-mock==1.12.1 requests-mock==1.12.1
# via label-studio-sdk # via label-studio-sdk
ruff==0.4.10 ruff==0.8.3
# via -r ./test.in # via -r ./test.in
semantic-version==2.10.0 semantic-version==2.10.0
# via liccheck # via liccheck

View File

@ -245,12 +245,14 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
_test(result) _test(result)
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
result = pdf.partition_pdf( result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number file=spooled_temp_file,
) strategy=strategy,
starting_page_number=starting_page_number,
)
_test(result) _test(result)
@ -757,14 +759,14 @@ def test_partition_pdf_metadata_date(
) )
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = pdf.partition_pdf( elements = pdf.partition_pdf(
file=spooled_temp_file, file=spooled_temp_file,
strategy=strategy, strategy=strategy,
metadata_last_modified=metadata_last_modified, metadata_last_modified=metadata_last_modified,
) )
assert {el.metadata.last_modified for el in elements} == {expected_last_modified} assert {el.metadata.last_modified for el in elements} == {expected_last_modified}
@ -1131,15 +1133,15 @@ def test_partition_pdf_with_ocr_only_strategy(
) )
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = pdf.partition_pdf( elements = pdf.partition_pdf(
file=spooled_temp_file, file=spooled_temp_file,
strategy=PartitionStrategy.OCR_ONLY, strategy=PartitionStrategy.OCR_ONLY,
languages=["eng"], languages=["eng"],
is_image=is_image, is_image=is_image,
) )
assert elements[0].metadata.languages == ["eng"] assert elements[0].metadata.languages == ["eng"]
# check pages # check pages

View File

@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file(
`python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
to ensure the source file is appropriately converted in this case. to ensure the source file is appropriately converted in this case.
""" """
with open(mock_document_file_path, "rb") as test_file: with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file = tempfile.SpooledTemporaryFile() with open(mock_document_file_path, "rb") as test_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = partition_docx(file=spooled_temp_file) elements = partition_docx(file=spooled_temp_file)
assert elements == expected_elements
for element in elements: assert elements == expected_elements
assert element.metadata.filename is None assert all(e.metadata.filename is None for e in elements)
def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]): def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]):
@ -921,16 +922,16 @@ class DescribeDocxPartitionerOptions:
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]
): ):
spooled_temp_file = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg") spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file opts_args["file"] = spooled_temp_file
opts = DocxPartitionerOptions(**opts_args) opts = DocxPartitionerOptions(**opts_args)
docx_file = opts._docx_file docx_file = opts._docx_file
assert docx_file is not spooled_temp_file assert docx_file is not spooled_temp_file
assert isinstance(docx_file, io.BytesIO) assert isinstance(docx_file, io.BytesIO)
assert docx_file.getvalue() == b"abcdefg" assert docx_file.getvalue() == b"abcdefg"
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]

View File

@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():
Including one that does not have its read-pointer set to the start. Including one that does not have its read-pointer set to the start.
""" """
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file = tempfile.SpooledTemporaryFile() with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
elements = partition_pptx(file=spooled_temp_file) elements = partition_pptx(file=spooled_temp_file)
assert elements == EXPECTED_PPTX_OUTPUT assert elements == EXPECTED_PPTX_OUTPUT
for element in elements: for element in elements:
assert element.metadata.filename is None assert element.metadata.filename is None
@ -701,16 +703,16 @@ class DescribePptxPartitionerOptions:
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]
): ):
spooled_temp_file = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg") spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file opts_args["file"] = spooled_temp_file
opts = PptxPartitionerOptions(**opts_args) opts = PptxPartitionerOptions(**opts_args)
pptx_file = opts.pptx_file pptx_file = opts.pptx_file
assert pptx_file is not spooled_temp_file assert pptx_file is not spooled_temp_file
assert isinstance(pptx_file, io.BytesIO) assert isinstance(pptx_file, io.BytesIO)
assert pptx_file.getvalue() == b"abcdefg" assert pptx_file.getvalue() == b"abcdefg"
def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile( def and_it_uses_the_provided_file_directly_when_not_a_SpooledTemporaryFile(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]

View File

@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():
def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji(): def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
f = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as f:
with open("example-docs/emoji.xlsx", "rb") as g: with open("example-docs/emoji.xlsx", "rb") as g:
f.write(g.read()) f.write(g.read())
elements = partition_xlsx(file=f, include_header=False)
elements = partition_xlsx(file=f, include_header=False)
assert sum(isinstance(element, Text) for element in elements) == 1 assert sum(isinstance(element, Text) for element in elements) == 1
assert len(elements) == 1 assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅" assert clean_extra_whitespace(elements[0].text) == "🤠😅"

View File

@ -1 +1 @@
__version__ = "0.16.12-dev1" # pragma: no cover __version__ = "0.16.12-dev2" # pragma: no cover

View File

@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
""" """
elements_to_return = [] elements_to_return = []
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
if page_number is None and isinstance(ontology_element, ontology.Page): if page_number is None and isinstance(ontology_element, ontology.Page):
page_number = ontology_element.page_number page_number = ontology_element.page_number
@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in text_classes): if any(isinstance(ontology_element, class_) for class_ in text_classes):
return True return True
if any(ontology_element.elementType == category for category in text_categories): return any(ontology_element.elementType == category for category in text_categories)
return True
return False
def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in inline_classes): if any(isinstance(ontology_element, class_) for class_ in inline_classes):
return True return True
if any(ontology_element.elementType == category for category in inline_categories): return any(ontology_element.elementType == category for category in inline_categories)
return True
return False
def unstructured_elements_to_ontology( def unstructured_elements_to_ontology(
@ -327,10 +320,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
if tag.attrs: if tag.attrs:
return False return False
if not tag.get_text(strip=True): return bool(not tag.get_text(strip=True))
return True
return False
def remove_empty_tags(soup): def remove_empty_tags(soup):
for tag in soup.find_all(): for tag in soup.find_all():
@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(
# Scenario 1: Valid Ontology Element # Scenario 1: Valid Ontology Element
if soup.attrs.get("class"): if soup.attrs.get("class"):
html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get( html_tag, element_class = (
(soup.name, soup.attrs["class"][0]) soup.name,
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
) )
# Scenario 2: HTML tag incorrect, CSS class correct # Scenario 2: HTML tag incorrect, CSS class correct