feat(ingest): add basic chunking to ingest (#2380)

The new "basic" chunking strategy and overlap options need to be
available from the ingest CLI. An ingest test of those features is also
welcome, both to verify the ingest feature and to defend against
regressions in the chunking code.

Add a local ingest test exercising both the "basic" chunking strategy
and intra-chunk overlap. Since there is no new source connector
involved, use the local ingest source and destination. Update
documentation to suit, filling in some details that hadn't made it into
the docs yet.
This commit is contained in:
Steve Canny 2024-01-12 12:27:34 -08:00 committed by GitHub
parent 50f142d4e0
commit 2f2c48acd5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 841 additions and 47 deletions

View File

@ -1,10 +1,11 @@
## 0.12.1-dev3
## 0.12.1-dev4
### Enhancements
* **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries.
* **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks.
* **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-file` has been renamed to `private-key`. Private key can be provided as path to file or file contents.
* **Add "basic" chunking to ingest CLI.** Add options to ingest CLI allowing access to the new "basic" chunking strategy and overlap options.
### Features

View File

@ -13,8 +13,19 @@ in vector databases, enhancing the quality and relevance of the results.
Configs
---------------------
* ``chunk_elements (default False)``: Boolean flag whether to run chunking as part of the ingest process.
* ``multipage_sections (default True)``: If True, sections can span multiple pages.
* ``chunk_elements (default False)``: (Deprecated) Boolean flag whether to run chunking as part of
the ingest process. This option is deprecated in favor of the ``chunking_strategy`` option. This
option being set True has the same effect as ``chunking_strategy=by_title``.
* ``chunking_strategy``: One of "basic" or "by_title". When omitted, no chunking is performed. The
"basic" strategy maximally fills each chunk with whole elements, up to the specified size limits
(``max_characters`` and ``new_after_n_chars`` described below). A single element that exceeds this
length is divided into two or more chunks using text-splitting. A ``Table`` element is never
combined with any other element and appears as a chunk of its own or as a sequence of
``TableChunk`` elements when splitting is required. The "by_title" behaviors are the same except that
section and optionally page boundaries are respected such that two consecutive elements from
different sections appear in separate chunks.
* ``multipage_sections (default True)``: When False, in addition to section boundaries, page
boundaries are also respected. Only operative for the "by_title" strategy.
* ``combine_text_under_n_chars (default 500)``: Combines elements (for example a series of titles) until a section reaches a length of n characters. Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for this argument suppresses combining of small chunks. Note this value is "capped" at the `new_after_n_chars` value since a value higher than that would not change this parameter's effect.
* ``new_after_n_chars (default 1500)``: Cuts off new sections once they reach a length of n characters (soft max). Defaults to `max_characters` when not specified, which effectively disables any soft window. Specifying 0 for this argument causes each element to appear in a chunk by itself (although an element with text longer than `max_characters` will still be split into two or more chunks).
* ``max_characters (default 1500)``: Chunks elements text and text_as_html (if present) into chunks of length n characters (hard max)

View File

@ -0,0 +1,649 @@
[
{
"type": "CompositeElement",
"element_id": "141239c4dab75a61a9d513fc033b2c33",
"text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 INTRODUCTION\n\nA.\tPURPOSE",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"emphasized_text_contents": [
"CHAPTER 1",
"INTRODUCTION"
],
"emphasized_text_tags": [
"b",
"b"
],
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "bbab04cdd1540f50e5bfecfbd381df2d",
"text": "The United States Trustee appoints and supervises standing trustees and monitors and supervises cases under chapter 13 of title 11 of the United",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "2118e8faa751f3dd7865ec383e72a8bb",
"text": "le 11 of the United States Code. 28 U.S.C. § 586(b). The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "3bbcf0b1c0d7fefcd5d876d2e7490d65",
"text": "es or clarifies the position of the United States Trustee Program (Program) on the duties owed by a standing trustee to the debtors, creditors, other",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "81667567d298d7ff9097eaad2ddeb30c",
"text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "9bbadecc7ed6546e05d84fb592a52e1a",
"text": "ment of the law; it should not be used as a substitute for legal research and analysis. The standing trustee must be familiar with relevant",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "23e818f12923bb5292381a53c7132474",
"text": "iliar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "5deeec95f6e04d6b6f19dd27a239b8e2",
"text": ", and case law. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). Standing trustees are encouraged to follow Practice Tips identified in",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "20bad03b8ea3a8edbe80e814d98f631a",
"text": "Tips identified in this Handbook but these are not considered mandatory.",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "b27631eca540a6a1b2029f3565b4706c",
"text": "Nothing in this Handbook should be construed to excuse the standing trustee from complying with all duties imposed by the Bankruptcy Code and Rules,",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "ff776964ebb49b8f1c79c2a2ed00e353",
"text": "tcy Code and Rules, local rules, and orders of the court. The standing trustee should notify the United States Trustee whenever the provision of the",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "74e50cee04371ad9ab9023de3e70e94a",
"text": "he provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "5d4e824c934173203e79473aed8c0880",
"text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "22b6ecadef903f8a7ad089e8972fa083",
"text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. § 101 et seq., unless otherwise indicated.",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "67483276a5011df9e435b054ad7fde6e",
"text": "This Handbook does not create additional rights against the standing trustee or United States Trustee in favor of other parties.",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 1
}
},
{
"type": "CompositeElement",
"element_id": "4966c7804baab3f751348b349370ec3a",
"text": "B.\tROLE OF THE UNITED STATES TRUSTEE",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "dddbded7d110a391543cad01b6692090",
"text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. Debtors, creditors,",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "9f60b8193e3cfe4e59375b238fd40a9f",
"text": "Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "4df88af9f7e8675f393e9836e1152959",
"text": "nted and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. To address these concerns, judicial and",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "1b863cb4fb3ec2686cbbd4f68111c1cd",
"text": "cerns, judicial and administrative functions within the bankruptcy system were bifurcated.",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "336455884cd8fb1f3d12d3202bbf7ddb",
"text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "4060f1c8720ff306864ea03276260465",
"text": "the Program. Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "102a88bc6dbcbef2f2768d34d078aaaa",
"text": "apter 13 trustees./ This Handbook is issued under the authority of the Programs enabling statutes. ",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "ba76416d7694fefb8e1c7e3d30aa93e1",
"text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "6a4103a612f3c02822142ed9f264ba55",
"text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. The",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "2d185b07953a24e5c39b6764e4e888f1",
"text": "bursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "b2446a962f13a1ddc963d33d59cbb955",
"text": "s unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "76c356cc9d5f81ae378d264f7cd0bdfe",
"text": "States Trustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. § 58.3(b).",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "1bf977cfa6a572f2f4be43b4bc0e7799",
"text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustees primary statutory duties are set forth in 11",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "7dff671218580aaa5d24fbc571ba002b",
"text": "are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "97bee5732e2b1b11659e9cdc431a2c15",
"text": "704. These duties include, but are not limited to, the following:",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
],
"page_number": 2
}
},
{
"type": "CompositeElement",
"element_id": "8dfc77ab4089e09d4397e12306086c19",
"text": "Copyright",
"metadata": {
"data_source": {
"url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx",
"permissions_data": [
{
"mode": 33188
}
]
},
"filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"languages": [
"eng"
]
}
}
]

View File

@ -1,2 +1,2 @@
metric average sample_sd population_sd count
element-type-accuracy 0.814 0.108 0.077 2
element-type-accuracy 0.542 0.476 0.389 3

1 metric average sample_sd population_sd count
2 element-type-accuracy 0.814 0.542 0.108 0.476 0.077 0.389 2 3

View File

@ -1,3 +1,4 @@
filename doctype connector element-type-accuracy
IRS-form-1987.pdf pdf azure 0.89
handbook-1p.docx docx local-single-file-basic-chunking 0.0
page-with-formula.pdf pdf s3 0.737

1 filename doctype connector element-type-accuracy
2 IRS-form-1987.pdf pdf azure 0.89
3 handbook-1p.docx docx local-single-file-basic-chunking 0.0
4 page-with-formula.pdf pdf s3 0.737

View File

@ -1,3 +1,3 @@
metric average sample_sd population_sd count
cct-accuracy 0.806 0.247 0.24 16
cct-%missing 0.025 0.033 0.032 16
cct-accuracy 0.809 0.24 0.233 17
cct-%missing 0.025 0.032 0.031 17

1 metric average sample_sd population_sd count
2 cct-accuracy 0.806 0.809 0.247 0.24 0.24 0.233 16 17
3 cct-%missing 0.025 0.033 0.032 0.032 0.031 16 17

View File

@ -9,6 +9,7 @@ example-10k.html html local 0.727 0.037
fake-html-cp1252.html html local 0.659 0.0
ideas-page.html html local 0.93 0.033
UDHR_first_article_all.txt txt local-single-file 0.995 0.0
handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.949 0.029

1 filename doctype connector cct-accuracy cct-%missing
9 fake-html-cp1252.html html local 0.659 0.0
10 ideas-page.html html local 0.93 0.033
11 UDHR_first_article_all.txt txt local-single-file 0.995 0.0
12 handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029
13 fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0
14 layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032
15 layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.949 0.029

View File

@ -0,0 +1,52 @@
#!/usr/bin/env bash

# Ingest test: run the local source connector on a single DOCX file using the "basic" chunking
# strategy with a small chunk window (150 chars) and a 20-char overlap, then compare the result
# against the expected output for this test.

set -e

# -- resolve paths relative to this script so the test can be launched from any directory --
SRC_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$SRC_PATH")
cd "$SCRIPT_DIR"/.. || exit 1

OUTPUT_FOLDER_NAME=local-single-file-basic-chunking
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME

# -- use absolute path of input file to verify passing an absolute path --
ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/handbook-1p.docx"

# -- default to one process per CPU unless MAX_PROCESSES is set by the caller --
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}

# -- `cleanup_dir` used below is expected to come from cleanup.sh --
# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh

# -- remove output and work directories on exit, whether the run passed or failed --
# shellcheck disable=SC2317
function cleanup() {
  cleanup_dir "$OUTPUT_DIR"
  cleanup_dir "$WORK_DIR"
}
trap cleanup EXIT

RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py}
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
  local \
  --chunking-strategy basic \
  --chunk-overlap 20 \
  --chunk-max-characters 150 \
  --input-path "$ABS_INPUT_PATH" \
  --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --reprocess \
  --verbose \
  --work-dir "$WORK_DIR"

# -- suspend errexit so a failed comparison does not abort before the evaluation step below --
set +e
"$SCRIPT_DIR"/check-diff-expected-output.sh "$OUTPUT_FOLDER_NAME"
EXIT_CODE=$?
set -e

if [ "$EXIT_CODE" -ne 0 ]; then
  echo "The last script run exited with a non-zero exit code: $EXIT_CODE."
  # -- exit is deferred: the evaluation step still runs and the code is reported at the end --
fi

"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME"

# -- propagate the comparison result as this test's exit status --
exit "$EXIT_CODE"

View File

@ -43,6 +43,7 @@ all_tests=(
# NOTE(ryan): This test is disabled because it is triggering too many requests to the API
# 'airtable-large.sh'
'local-single-file.sh'
'local-single-file-basic-chunking.sh'
'local-single-file-with-encoding.sh'
'local-single-file-with-pdf-infer-table-structure.sh'
'notion.sh'

View File

@ -1 +1 @@
__version__ = "0.12.1-dev3" # pragma: no cover
__version__ = "0.12.1-dev4" # pragma: no cover

View File

@ -1,3 +1,5 @@
from __future__ import annotations
import json
import os.path
import typing as t
@ -8,6 +10,7 @@ from pathlib import Path
import click
from dataclasses_json.core import Json
from typing_extensions import Self
from unstructured.ingest.interfaces import (
BaseConfig,
@ -462,59 +465,112 @@ class CliChunkingConfig(ChunkingConfig, CliMixin):
["--chunk-elements"],
is_flag=True,
default=False,
help="Deprecated, use --chunking-strategy instead.",
),
click.Option(
["--chunking-strategy"],
type=click.Choice(["basic", "by_title"]),
help="The rule-set to use to form chunks. Omit to disable chunking.",
),
click.Option(
["--chunk-multipage-sections"],
is_flag=True,
default=False,
help=(
"Ignore page boundaries when chunking such that elements from two different"
" pages can appear in the same chunk. Only operative for 'by_title'"
" chunking-strategy."
),
),
click.Option(
["--chunk-combine-text-under-n-chars"],
type=int,
default=500,
show_default=True,
help=(
"Combine consecutive chunks when the first does not exceed this length and"
" the second will fit without exceeding the hard-maximum length. Only"
" operative for 'by_title' chunking-strategy."
),
),
click.Option(
["--chunk-new-after-n-chars"],
type=int,
default=1500,
show_default=True,
help=(
"Soft-maximum chunk length. Another element will not be added to a chunk of"
" this length even when it would fit without exceeding the hard-maximum"
" length."
),
),
click.Option(
["--chunk-max-characters"],
type=int,
default=1500,
show_default=True,
help=(
"Hard maximum chunk length. No chunk will exceed this length. An oversized"
" element will be divided by text-splitting to fit this window."
),
),
# -- overlap length in characters; the help text must name the companion flag exactly as it is
# -- declared (`--chunk-overlap-all`) so users can find it
click.Option(
    ["--chunk-overlap"],
    type=int,
    default=0,
    show_default=True,
    help=(
        "Prefix chunk text with last overlap=N characters of prior chunk. Only"
        " applies to oversized chunks divided by text-splitting. To apply overlap to"
        " non-oversized chunks use the --chunk-overlap-all option."
    ),
),
# -- boolean flag extending overlap to chunks formed from whole elements; the overlap length
# -- itself comes from the `--chunk-overlap` option
click.Option(
    ["--chunk-overlap-all"],
    is_flag=True,
    default=False,
    help=(
        "Apply overlap to chunks formed from whole elements as well as those formed"
        " by text-splitting oversized elements. Overlap length is taken from the"
        " --chunk-overlap option value."
    ),
),
]
return options
@classmethod
def from_dict(cls, kvs: Json, **kwargs):
def from_dict(cls, kvs: Json, **kwargs: t.Any) -> t.Optional[Self]:
"""Extension of dataclass from_dict() to avoid a naming conflict with other CLI params.
This allows CLI arguments to be prefixed with "chunking_" during CLI invocation but doesn't
require that as part of the field names in this class
"""
Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params.
This allows CLI arguments to be prepended with chunking_ during CLI invocation but
doesn't require that as part of the field names in this class
"""
if isinstance(kvs, dict):
kvs = kvs.copy()
new_kvs = {}
if "chunk_elements" in kvs:
chunk_elements = kvs.pop("chunk_elements")
if not chunk_elements:
return None
new_kvs["chunk_elements"] = chunk_elements
new_kvs.update(
{
k[len("chunk_") :]: v # noqa: E203
for k, v in kvs.items()
if k.startswith("chunk_")
},
if not isinstance(kvs, dict):
return super().from_dict(kvs=kvs, **kwargs)
options: t.Dict[str, t.Any] = kvs.copy()
chunk_elements = options.pop("chunk_elements", None)
chunking_strategy = options.pop("chunking_strategy", None)
# -- when neither are specified, chunking is not requested --
if not chunk_elements and not chunking_strategy:
return None
def iter_kv_pairs() -> t.Iterator[t.Tuple[str, t.Any]]:
# -- newer `chunking_strategy` option takes precedence over legacy `chunk_elements` --
if chunking_strategy:
yield "chunking_strategy", chunking_strategy
# -- but legacy case is still supported, equivalent to `chunking_strategy="by_title" --
elif chunk_elements:
yield "chunking_strategy", "by_title"
yield from (
(key[len("chunk_") :], value)
for key, value in options.items()
if key.startswith("chunk_")
)
if len(new_kvs.keys()) == 0:
return None
return super().from_dict(kvs=new_kvs, **kwargs)
return super().from_dict(kvs=kvs, **kwargs)
new_kvs = dict(iter_kv_pairs())
return None if len(new_kvs) == 0 else super().from_dict(kvs=new_kvs, **kwargs)
class CliPermissionsConfig(PermissionsConfig, CliMixin):

View File

@ -40,12 +40,12 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin):
allow_nan: bool = True,
indent: t.Optional[t.Union[int, str]] = None,
separators: t.Optional[t.Tuple[str, str]] = None,
default: t.Optional[t.Callable] = None,
default: t.Optional[t.Callable[..., t.Any]] = None,
sort_keys: bool = False,
redact_sensitive=False,
redacted_text="***REDACTED***",
redact_sensitive: bool = False,
redacted_text: str = "***REDACTED***",
apply_name_overload: bool = True,
**kw
**kw: t.Any,
) -> str:
return json.dumps(
self.to_dict(
@ -63,7 +63,7 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin):
separators=separators,
default=default,
sort_keys=sort_keys,
**kw
**kw,
)
@classmethod
@ -72,15 +72,15 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin):
kvs: dataclasses_json_core.Json,
*,
infer_missing=False,
apply_name_overload=False
apply_name_overload=False,
) -> A:
return dataclasses_json_core._decode_dataclass(cls, kvs, infer_missing)
def to_dict(
self,
encode_json=False,
redact_sensitive=False,
redacted_text="***REDACTED***",
encode_json: bool = False,
redact_sensitive: bool = False,
redacted_text: str = "***REDACTED***",
apply_name_overload: bool = True,
) -> t.Dict[str, dataclasses_json_core.Json]:
return _asdict(

View File

@ -14,6 +14,7 @@ from pathlib import Path
from dataclasses_json import DataClassJsonMixin
from dataclasses_json.core import Json, _decode_dataclass
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import DataSourceMetadata
from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element
@ -175,7 +176,7 @@ class FsspecConfig(FileStorageConfig):
@dataclass
class ReadConfig(BaseConfig):
# where raw documents are stored for processing, and then removed if not preserve_downloads
download_dir: str = ""
download_dir: t.Optional[str] = ""
re_download: bool = False
preserve_downloads: bool = False
download_only: bool = False
@ -213,22 +214,43 @@ class EmbeddingConfig(BaseConfig):
@dataclass
class ChunkingConfig(BaseConfig):
chunk_elements: bool = False
chunking_strategy: t.Optional[str] = None
multipage_sections: bool = True
combine_text_under_n_chars: int = 500
max_characters: int = 1500
new_after_n_chars: t.Optional[int] = None
overlap: int = 0
overlap_all: bool = False
def chunk(self, elements: t.List[Element]) -> t.List[Element]:
if self.chunk_elements:
return chunk_by_title(
chunking_strategy = (
self.chunking_strategy
if self.chunking_strategy in ("basic", "by_title")
else "by_title"
if self.chunk_elements is True
else None
)
return (
chunk_by_title(
elements=elements,
multipage_sections=self.multipage_sections,
combine_text_under_n_chars=self.combine_text_under_n_chars,
max_characters=self.max_characters,
multipage_sections=self.multipage_sections,
new_after_n_chars=self.new_after_n_chars,
overlap=self.overlap,
overlap_all=self.overlap_all,
)
else:
return elements
if chunking_strategy == "by_title"
else chunk_elements(
elements=elements,
max_characters=self.max_characters,
new_after_n_chars=self.new_after_n_chars,
overlap=self.overlap,
overlap_all=self.overlap_all,
)
if chunking_strategy == "basic"
else elements
)
@dataclass

View File

@ -1,4 +1,4 @@
from typing import IO, List, Optional
from typing import IO, Any, List, Optional
from unstructured.chunking import add_chunking_strategy
from unstructured.documents.elements import Element, process_metadata
@ -22,7 +22,7 @@ def partition_epub(
chunking_strategy: Optional[str] = None,
languages: Optional[List[str]] = ["auto"],
detect_language_per_element: bool = False,
**kwargs,
**kwargs: Any,
) -> List[Element]:
"""Partitions an EPUB document. The document is first converted to HTML and then
partitioned using partition_html.