2023-09-19 15:32:46 -07:00
|
|
|
# -- Black formatter settings; line-length is the same value used under `[tool.ruff]` --
[tool.black]
|
|
|
|
line-length = 100
|
|
|
|
|
2024-02-20 17:35:16 -08:00
|
|
|
# -- Pyright static type-checker settings --
[tool.pyright]
|
|
|
|
# -- platform and Python version that pyright assumes when evaluating code --
pythonPlatform = "Linux"
|
2024-05-21 15:11:46 -07:00
|
|
|
pythonVersion = "3.9"
|
2024-02-20 17:35:16 -08:00
|
|
|
# -- report `cast()` calls and `# type: ignore` comments that are not needed --
reportUnnecessaryCast = true
|
|
|
|
reportUnnecessaryTypeIgnoreComment = true
|
|
|
|
# -- directory holding locally-maintained type stubs --
stubPath = "./typings"
|
|
|
|
typeCheckingMode = "strict"
|
|
|
|
verboseOutput = true
|
|
|
|
|
2023-09-19 15:32:46 -07:00
|
|
|
# -- Ruff linter settings; line-length is the same value used under `[tool.black]` --
[tool.ruff]
|
|
|
|
line-length = 100
|
fix(xlsx): xlsx subtable algorithm (#2534)
**Reviewers:** It may be easier to review each of the two commits
separately. The first adds the new `_SubtableParser` object with its
unit-tests and the second one uses that object to replace the flawed
existing subtable-parsing algorithm.
**Summary**
There is a cluster of bugs in `partition_xlsx()` that all derive from
flaws in the algorithm we use to detect "subtables". These are
encountered when the user wants to get multiple document-elements from
each worksheet, which is the default (argument `find_subtable = True`).
This PR replaces the flawed existing algorithm with a `_SubtableParser`
object that encapsulates all that logic and has thorough unit-tests.
**Additional Context**
This is a summary of the failure cases. There are a few other cases but
they're closely related and this was enough evidence and scope for my
purposes. This PR fixes all these bugs:
```python
#
# -- ✅ CASE 1: There are no leading or trailing single-cell rows.
# -> the subtable functions never get called, subtable is emitted as the only element
#
# a b -> Table(a, b, c, d)
# c d
# -- ✅ CASE 2: There is exactly one leading single-cell row.
# -> Leading single-cell row emitted as `Title` element, core-table properly identified.
#
# a -> [ Title(a),
# b c Table(b, c, d, e) ]
# d e
# -- ❌ CASE 3: There are two-or-more leading single-cell rows.
# -> leading single-cell rows are included in subtable
#
# a -> [ Table(a, b, c, d, e, f) ]
# b
# c d
# e f
# -- ❌ CASE 4: There is exactly one trailing single-cell row.
# -> core table is dropped. trailing single-cell row is emitted as Title
# (this is the behavior in the reported bug)
#
# a b -> [ Title(e) ]
# c d
# e
# -- ❌ CASE 5: There are two-or-more trailing single-cell rows.
# -> core table is dropped. trailing single-cell rows are each emitted as a Title
#
# a b -> [ Title(e),
# c d Title(f) ]
# e
# f
# -- ✅ CASE 6: There are exactly one each leading and trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b c Table(b, c, d, e),
# d e Title(f) ]
# f
# -- ✅ CASE 7: There are two leading and one trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b Title(b),
# c d Table(c, d, e, f),
# e f Title(g) ]
# g
# -- ✅ CASE 8: There are two-or-more leading and trailing single-cell rows.
# -> core table is correctly identified, leading and trailing single-cell rows are each
# emitted as a Title.
#
# a -> [ Title(a),
# b Title(b),
# c d Table(c, d, e, f),
# e f Title(g),
# g Title(h) ]
# h
# -- ❌ CASE 9: Single-row subtable, no single-cell rows above or below.
# -> First cell is mistakenly emitted as title, remaining cells are dropped.
#
# a b c -> [ Title(a) ]
# -- ❌ CASE 10: Single-row subtable with one leading single-cell row.
# -> Leading single-cell row is correctly identified as title, core-table is mis-identified
# as a `Title` and truncated.
#
# a -> [ Title(a),
# b c d Title(b) ]
```
2024-02-13 20:29:17 -08:00
|
|
|
|
|
|
|
# -- changes made here should also be made in `.pre-commit-config.yaml` and `Makefile` --
|
2024-03-18 02:09:44 +01:00
|
|
|
# -- rule families and individual rules that Ruff enforces --
lint.select = [
|
2023-09-19 15:32:46 -07:00
|
|
|
"C4", # -- flake8-comprehensions --
|
|
|
|
"COM", # -- flake8-commas --
|
|
|
|
"E", # -- pycodestyle errors --
|
|
|
|
"F", # -- pyflakes --
|
|
|
|
"I", # -- isort (imports) --
|
|
|
|
# -- NOTE(review): PLR0402 is Ruff's `manual-from-import` rule; the previous comment here
# -- described self-comparison (`foo == foo`), which is PLR0124. Confirm which was intended.
"PLR0402", # -- `import a.b as b` should be written `from a import b` --
|
|
|
|
"PT", # -- flake8-pytest-style --
|
|
|
|
"SIM", # -- flake8-simplify --
|
|
|
|
"UP015", # -- redundant `open()` mode parameter (like "r" is default) --
|
|
|
|
"UP018", # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
|
|
|
|
"UP032", # -- Use f-string instead of `.format()` call --
|
|
|
|
"UP034", # -- Avoid extraneous parentheses --
|
2024-05-16 09:50:25 -07:00
|
|
|
"W", # -- Warnings, including invalid escape-sequence --
|
2023-09-19 15:32:46 -07:00
|
|
|
]
|
2024-03-18 02:09:44 +01:00
|
|
|
# -- individual rules switched off; each entry's comment gives the reason --
lint.ignore = [
|
2023-09-19 15:32:46 -07:00
|
|
|
"COM812", # -- over-aggressively insists on trailing commas where not desirable --
|
2024-06-05 14:12:27 -07:00
|
|
|
"PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) --
|
2024-03-14 14:31:58 -07:00
|
|
|
"PT005", # -- flags mock fixtures with names intentionally matching private method name --
|
2023-09-19 15:32:46 -07:00
|
|
|
"PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception --
|
|
|
|
"PT012", # -- pytest.raises() block should contain a single simple statement --
|
|
|
|
"SIM117", # -- merge `with` statements for context managers that have same scope --
|
|
|
|
]
|