build: remove ruff version upper bound (#3829)

**Summary**
Remove pin on `ruff` linter and fix the handful of lint errors a newer
version catches.
This commit is contained in:
Steve Canny 2024-12-16 15:01:22 -08:00 committed by GitHub
parent b092fb7f47
commit 10f0d54ac2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 81 additions and 83 deletions

View File

@@ -1,4 +1,4 @@
## 0.16.12-dev1 ## 0.16.12-dev2
### Enhancements ### Enhancements
@@ -8,6 +8,8 @@
### Fixes ### Fixes
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
## 0.16.11 ## 0.16.11
### Enhancements ### Enhancements

View File

@@ -12,9 +12,17 @@ verboseOutput = true
[tool.ruff] [tool.ruff]
line-length = 100 line-length = 100
target-version = "py39"
# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` -- [tool.ruff.lint]
lint.select = [ ignore = [
"COM812", # -- over aggressively insists on trailing commas where not desireable --
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
"PT012", # -- pytest.raises() block should contain a single simple statement --
"SIM117", # -- merge `with` statements for context managers that have same scope --
]
select = [
"C4", # -- flake8-comprehensions -- "C4", # -- flake8-comprehensions --
"COM", # -- flake8-commas -- "COM", # -- flake8-commas --
"E", # -- pycodestyle errors -- "E", # -- pycodestyle errors --
@@ -29,11 +37,3 @@ lint.select = [
"UP034", # -- Avoid extraneous parentheses -- "UP034", # -- Avoid extraneous parentheses --
"W", # -- Warnings, including invalid escape-sequence -- "W", # -- Warnings, including invalid escape-sequence --
] ]
lint.ignore = [
"COM812", # -- over aggressively insists on trailing commas where not desireable --
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
"PT005", # -- flags mock fixtures with names intentionally matching private method name --
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
"PT012", # -- pytest.raises() block should contain a single simple statement --
"SIM117", # -- merge `with` statements for context managers that have same scope --
]

View File

@@ -11,9 +11,7 @@ mypy
pydantic pydantic
pytest-cov pytest-cov
pytest-mock pytest-mock
# NOTE(robison) - we need to do additional cleanup to pass ruff
# linting for the latest version of ruff
ruff<0.5.0
types-Markdown types-Markdown
types-requests types-requests
types-tabulate types-tabulate

View File

@@ -171,7 +171,7 @@ requests==2.32.3
# requests-mock # requests-mock
requests-mock==1.12.1 requests-mock==1.12.1
# via label-studio-sdk # via label-studio-sdk
ruff==0.4.10 ruff==0.8.3
# via -r ./test.in # via -r ./test.in
semantic-version==2.10.0 semantic-version==2.10.0
# via liccheck # via liccheck

View File

@@ -245,11 +245,13 @@ def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
_test(result) _test(result)
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
result = pdf.partition_pdf( result = pdf.partition_pdf(
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number file=spooled_temp_file,
strategy=strategy,
starting_page_number=starting_page_number,
) )
_test(result) _test(result)
@@ -757,7 +759,7 @@ def test_partition_pdf_metadata_date(
) )
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = pdf.partition_pdf( elements = pdf.partition_pdf(
@@ -1131,7 +1133,7 @@ def test_partition_pdf_with_ocr_only_strategy(
) )
else: else:
with open(filename, "rb") as test_file: with open(filename, "rb") as test_file:
spooled_temp_file = SpooledTemporaryFile() with SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = pdf.partition_pdf( elements = pdf.partition_pdf(

View File

@@ -77,14 +77,15 @@ def test_partition_docx_with_spooled_file(
`python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need `python-docx` will NOT accept a `SpooledTemporaryFile` in Python versions before 3.11 so we need
to ensure the source file is appropriately converted in this case. to ensure the source file is appropriately converted in this case.
""" """
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
with open(mock_document_file_path, "rb") as test_file: with open(mock_document_file_path, "rb") as test_file:
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
spooled_temp_file.seek(0) spooled_temp_file.seek(0)
elements = partition_docx(file=spooled_temp_file) elements = partition_docx(file=spooled_temp_file)
assert elements == expected_elements assert elements == expected_elements
for element in elements: assert all(e.metadata.filename is None for e in elements)
assert element.metadata.filename is None
def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]): def test_partition_docx_from_file(mock_document_file_path: str, expected_elements: list[Text]):
@@ -921,7 +922,7 @@ class DescribeDocxPartitionerOptions:
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]
): ):
spooled_temp_file = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg") spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file opts_args["file"] = spooled_temp_file
opts = DocxPartitionerOptions(**opts_args) opts = DocxPartitionerOptions(**opts_args)

View File

@@ -74,10 +74,12 @@ def test_partition_pptx_with_spooled_file():
Including one that does not have its read-pointer set to the start. Including one that does not have its read-pointer set to the start.
""" """
with tempfile.SpooledTemporaryFile() as spooled_temp_file:
with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file: with open(example_doc_path("fake-power-point.pptx"), "rb") as test_file:
spooled_temp_file = tempfile.SpooledTemporaryFile()
spooled_temp_file.write(test_file.read()) spooled_temp_file.write(test_file.read())
elements = partition_pptx(file=spooled_temp_file) elements = partition_pptx(file=spooled_temp_file)
assert elements == EXPECTED_PPTX_OUTPUT assert elements == EXPECTED_PPTX_OUTPUT
for element in elements: for element in elements:
assert element.metadata.filename is None assert element.metadata.filename is None
@@ -701,7 +703,7 @@ class DescribePptxPartitionerOptions:
def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided( def and_it_uses_a_BytesIO_file_to_replaces_a_SpooledTemporaryFile_provided(
self, opts_args: dict[str, Any] self, opts_args: dict[str, Any]
): ):
spooled_temp_file = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as spooled_temp_file:
spooled_temp_file.write(b"abcdefg") spooled_temp_file.write(b"abcdefg")
opts_args["file"] = spooled_temp_file opts_args["file"] = spooled_temp_file
opts = PptxPartitionerOptions(**opts_args) opts = PptxPartitionerOptions(**opts_args)

View File

@@ -64,10 +64,12 @@ def test_partition_xlsx_from_filename():
def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji(): def test_partition_xlsx_from_SpooledTemporaryFile_with_emoji():
f = tempfile.SpooledTemporaryFile() with tempfile.SpooledTemporaryFile() as f:
with open("example-docs/emoji.xlsx", "rb") as g: with open("example-docs/emoji.xlsx", "rb") as g:
f.write(g.read()) f.write(g.read())
elements = partition_xlsx(file=f, include_header=False) elements = partition_xlsx(file=f, include_header=False)
assert sum(isinstance(element, Text) for element in elements) == 1 assert sum(isinstance(element, Text) for element in elements) == 1
assert len(elements) == 1 assert len(elements) == 1
assert clean_extra_whitespace(elements[0].text) == "🤠😅" assert clean_extra_whitespace(elements[0].text) == "🤠😅"

View File

@@ -1 +1 @@
__version__ = "0.16.12-dev1" # pragma: no cover __version__ = "0.16.12-dev2" # pragma: no cover

View File

@@ -53,7 +53,6 @@ def ontology_to_unstructured_elements(
""" """
elements_to_return = [] elements_to_return = []
if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT: if ontology_element.elementType == ontology.ElementTypeEnum.layout and depth <= RECURSION_LIMIT:
if page_number is None and isinstance(ontology_element, ontology.Page): if page_number is None and isinstance(ontology_element, ontology.Page):
page_number = ontology_element.page_number page_number = ontology_element.page_number
@@ -200,10 +199,7 @@ def is_text_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in text_classes): if any(isinstance(ontology_element, class_) for class_ in text_classes):
return True return True
if any(ontology_element.elementType == category for category in text_categories): return any(ontology_element.elementType == category for category in text_categories)
return True
return False
def is_inline_element(ontology_element: ontology.OntologyElement) -> bool: def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
@@ -218,10 +214,7 @@ def is_inline_element(ontology_element: ontology.OntologyElement) -> bool:
if any(isinstance(ontology_element, class_) for class_ in inline_classes): if any(isinstance(ontology_element, class_) for class_ in inline_classes):
return True return True
if any(ontology_element.elementType == category for category in inline_categories): return any(ontology_element.elementType == category for category in inline_categories)
return True
return False
def unstructured_elements_to_ontology( def unstructured_elements_to_ontology(
@@ -327,10 +320,7 @@ def remove_empty_tags_from_html_content(html_content: str) -> str:
if tag.attrs: if tag.attrs:
return False return False
if not tag.get_text(strip=True): return bool(not tag.get_text(strip=True))
return True
return False
def remove_empty_tags(soup): def remove_empty_tags(soup):
for tag in soup.find_all(): for tag in soup.find_all():
@@ -419,8 +409,9 @@ def extract_tag_and_ontology_class_from_tag(
# Scenario 1: Valid Ontology Element # Scenario 1: Valid Ontology Element
if soup.attrs.get("class"): if soup.attrs.get("class"):
html_tag, element_class = soup.name, HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get( html_tag, element_class = (
(soup.name, soup.attrs["class"][0]) soup.name,
HTML_TAG_AND_CSS_NAME_TO_ELEMENT_TYPE_MAP.get((soup.name, soup.attrs["class"][0])),
) )
# Scenario 2: HTML tag incorrect, CSS class correct # Scenario 2: HTML tag incorrect, CSS class correct