diff --git a/CHANGELOG.md b/CHANGELOG.md index 01872aa4a..c904f3963 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,10 +1,11 @@ -## 0.12.1-dev3 +## 0.12.1-dev4 ### Enhancements * **Add "basic" chunking strategy.** Add baseline chunking strategy that includes all shared chunking behaviors without breaking chunks on section or page boundaries. * **Add overlap option for chunking.** Add option to overlap chunks. Intra-chunk and inter-chunk overlap are requested separately. Intra-chunk overlap is applied only to the second and later chunks formed by text-splitting an oversized chunk. Inter-chunk overlap may also be specified; this applies overlap between "normal" (not-oversized) chunks. * **Salesforce connector accepts private key path or value.** Salesforce parameter `private-key-file` has been renamed to `private-key`. Private key can be provided as path to file or file contents. +* **Add "basic" chunking to ingest CLI.** Add options to ingest CLI allowing access to the new "basic" chunking strategy and overlap options. ### Features diff --git a/docs/source/ingest/configs/chunking_config.rst b/docs/source/ingest/configs/chunking_config.rst index efc6b57a9..06aa1de5f 100644 --- a/docs/source/ingest/configs/chunking_config.rst +++ b/docs/source/ingest/configs/chunking_config.rst @@ -13,8 +13,19 @@ in vector databases, enhancing the quality and relevance of the results. Configs --------------------- -* ``chunk_elements (default False)``: Boolean flag whether to run chunking as part of the ingest process. -* ``multipage_sections (default True)``: If True, sections can span multiple pages. +* ``chunk_elements (default False)``: (Deprecated) Boolean flag whether to run chunking as part of + the ingest process. This option is deprecated in favor of the ``chunking_strategy`` option. This + option being set True has the same effect as ``chunking_strategy=by_title``. +* ``chunking_strategy``: One of "basic" or "by_title". When omitted, no chunking is performed. 
The + "basic" strategy maximally fills each chunk with whole elements, up to the specified size limits + (``max_characters`` and ``new_after_n_chars`` described below). A single element that exceeds this + length is divided into two or more chunks using text-splitting. A ``Table`` element is never + combined with any other element and appears as a chunk of its own or as a sequence of + ``TableChunk`` elements when splitting is required. The "by_title" behaviors are the same except that + section and optionally page boundaries are respected such that two consecutive elements from + different sections appear in separate chunks. +* ``multipage_sections (default True)``: When False, in addition to section boundaries, page + boundaries are also respected. Only operative for the "by_title" strategy. * ``combine_text_under_n_chars (default 500)``: Combines elements (for example a series of titles) until a section reaches a length of n characters. Defaults to `max_characters` which combines chunks whenever space allows. Specifying 0 for this argument suppresses combining of small chunks. Note this value is "capped" at the `new_after_n_chars` value since a value higher than that would not change this parameter's effect. * ``new_after_n_chars (default 1500)``: Cuts off new sections once they reach a length of n characters (soft max). Defaults to `max_characters` when not specified, which effectively disables any soft window. Specifying 0 for this argument causes each element to appear in a chunk by itself (although an element with text longer than `max_characters` will be still be split into two or more chunks). 
* ``max_characters (default 1500)``: Chunks elements text and text_as_html (if present) into chunks of length n characters (hard max) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json new file mode 100644 index 000000000..d41d0f349 --- /dev/null +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -0,0 +1,649 @@ +[ + { + "type": "CompositeElement", + "element_id": "141239c4dab75a61a9d513fc033b2c33", + "text": "US Trustee Handbook\n\nCHAPTER 1\n\nINTRODUCTION\n\nCHAPTER 1 – INTRODUCTION\n\nA.\tPURPOSE", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "emphasized_text_contents": [ + "CHAPTER 1", + "INTRODUCTION" + ], + "emphasized_text_tags": [ + "b", + "b" + ], + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "bbab04cdd1540f50e5bfecfbd381df2d", + "text": "The United States Trustee appoints and supervises standing trustees and monitors and supervises cases under chapter 13 of title 11 of the United", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "2118e8faa751f3dd7865ec383e72a8bb", + "text": "le 11 of the United States Code. 28 U.S.C. § 586(b). 
The Handbook, issued as part of our duties under 28 U.S.C. § 586, establishes or clarifies the", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "3bbcf0b1c0d7fefcd5d876d2e7490d65", + "text": "es or clarifies the position of the United States Trustee Program (Program) on the duties owed by a standing trustee to the debtors, creditors, other", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "81667567d298d7ff9097eaad2ddeb30c", + "text": "s, creditors, other parties in interest, and the United States Trustee. The Handbook does not present a full and complete statement of the law; it", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "9bbadecc7ed6546e05d84fb592a52e1a", + "text": "ment of the law; it should not be used as a substitute for legal research and analysis. 
The standing trustee must be familiar with relevant", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "23e818f12923bb5292381a53c7132474", + "text": "iliar with relevant provisions of the Bankruptcy Code, Federal Rules of Bankruptcy Procedure (Rules), any local bankruptcy rules, and case law. 11", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "5deeec95f6e04d6b6f19dd27a239b8e2", + "text": ", and case law. 11 U.S.C. § 321, 28 U.S.C. § 586, 28 C.F.R. § 58.6(a)(3). 
Standing trustees are encouraged to follow Practice Tips identified in", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "20bad03b8ea3a8edbe80e814d98f631a", + "text": "Tips identified in this Handbook but these are not considered mandatory.", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "b27631eca540a6a1b2029f3565b4706c", + "text": "Nothing in this Handbook should be construed to excuse the standing trustee from complying with all duties imposed by the Bankruptcy Code and Rules,", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "ff776964ebb49b8f1c79c2a2ed00e353", + "text": "tcy Code and Rules, local rules, and orders of the court. 
The standing trustee should notify the United States Trustee whenever the provision of the", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "74e50cee04371ad9ab9023de3e70e94a", + "text": "he provision of the Handbook conflicts with the local rules or orders of the court. The standing trustee is accountable for all duties set forth in", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "5d4e824c934173203e79473aed8c0880", + "text": "duties set forth in this Handbook, but need not personally perform any duty unless otherwise indicated. All statutory references in this Handbook", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "22b6ecadef903f8a7ad089e8972fa083", + "text": "es in this Handbook refer to the Bankruptcy Code, 11 U.S.C. 
§ 101 et seq., unless otherwise indicated.", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "67483276a5011df9e435b054ad7fde6e", + "text": "This Handbook does not create additional rights against the standing trustee or United States Trustee in favor of other parties.", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 1 + } + }, + { + "type": "CompositeElement", + "element_id": "4966c7804baab3f751348b349370ec3a", + "text": "B.\tROLE OF THE UNITED STATES TRUSTEE", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "dddbded7d110a391543cad01b6692090", + "text": "The Bankruptcy Reform Act of 1978 removed the bankruptcy judge from the responsibilities for daytoday administration of cases. 
Debtors, creditors,", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "9f60b8193e3cfe4e59375b238fd40a9f", + "text": "Debtors, creditors, and third parties with adverse interests to the trustee were concerned that the court, which previously appointed and supervised", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "4df88af9f7e8675f393e9836e1152959", + "text": "nted and supervised the trustee, would not impartially adjudicate their rights as adversaries of that trustee. 
To address these concerns, judicial and", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "1b863cb4fb3ec2686cbbd4f68111c1cd", + "text": "cerns, judicial and administrative functions within the bankruptcy system were bifurcated.", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "336455884cd8fb1f3d12d3202bbf7ddb", + "text": "Many administrative functions formerly performed by the court were placed within the Department of Justice through the creation of the Program. Among", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "4060f1c8720ff306864ea03276260465", + "text": "the Program. 
Among the administrative functions assigned to the United States Trustee were the appointment and supervision of chapter 13 trustees./", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "102a88bc6dbcbef2f2768d34d078aaaa", + "text": "apter 13 trustees./ This Handbook is issued under the authority of the Program’s enabling statutes. ", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "ba76416d7694fefb8e1c7e3d30aa93e1", + "text": "C.\tSTATUTORY DUTIES OF A STANDING TRUSTEE\t", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "6a4103a612f3c02822142ed9f264ba55", + "text": "The standing trustee has a fiduciary responsibility to the bankruptcy estate. The standing trustee is more than a mere disbursing agent. 
The", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "2d185b07953a24e5c39b6764e4e888f1", + "text": "bursing agent. The standing trustee must be personally involved in the trustee operation. If the standing trustee is or becomes unable to perform", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "b2446a962f13a1ddc963d33d59cbb955", + "text": "s unable to perform the duties and responsibilities of a standing trustee, the standing trustee must immediately advise the United States Trustee. 28", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "76c356cc9d5f81ae378d264f7cd0bdfe", + "text": "States Trustee. 28 U.S.C. § 586(b), 28 C.F.R. § 58.4(b) referencing 28 C.F.R. 
§ 58.3(b).", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "1bf977cfa6a572f2f4be43b4bc0e7799", + "text": "Although this Handbook is not intended to be a complete statutory reference, the standing trustee’s primary statutory duties are set forth in 11", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "7dff671218580aaa5d24fbc571ba002b", + "text": "are set forth in 11 U.S.C. § 1302, which incorporates by reference some of the duties of chapter 7 trustees found in 11 U.S.C. § 704. These duties", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "97bee5732e2b1b11659e9cdc431a2c15", + "text": "704. 
These duties include, but are not limited to, the following:", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ], + "page_number": 2 + } + }, + { + "type": "CompositeElement", + "element_id": "8dfc77ab4089e09d4397e12306086c19", + "text": "Copyright", + "metadata": { + "data_source": { + "url": "/home/runner/work/unstructured/unstructured/test_unstructured_ingest/../example-docs/handbook-1p.docx", + "permissions_data": [ + { + "mode": 33188 + } + ] + }, + "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "languages": [ + "eng" + ] + } + } +] \ No newline at end of file diff --git a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv index 526bd6f08..fcadd93c2 100644 --- a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv +++ b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv @@ -1,2 +1,2 @@ metric average sample_sd population_sd count -element-type-accuracy 0.814 0.108 0.077 2 +element-type-accuracy 0.542 0.476 0.389 3 diff --git a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv index aa368f746..fb6568275 100644 --- a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv +++ b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv @@ -1,3 +1,4 @@ filename doctype connector element-type-accuracy IRS-form-1987.pdf pdf azure 0.89 +handbook-1p.docx docx local-single-file-basic-chunking 0.0 page-with-formula.pdf pdf s3 0.737 
diff --git a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv index e81eefc50..8041138ab 100644 --- a/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv +++ b/test_unstructured_ingest/metrics/text-extraction/aggregate-scores-cct.tsv @@ -1,3 +1,3 @@ metric average sample_sd population_sd count -cct-accuracy 0.806 0.247 0.24 16 -cct-%missing 0.025 0.033 0.032 16 +cct-accuracy 0.809 0.24 0.233 17 +cct-%missing 0.025 0.032 0.031 17 diff --git a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv index 53059f306..6770d8458 100644 --- a/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv +++ b/test_unstructured_ingest/metrics/text-extraction/all-docs-cct.tsv @@ -9,6 +9,7 @@ example-10k.html html local 0.727 0.037 fake-html-cp1252.html html local 0.659 0.0 ideas-page.html html local 0.93 0.033 UDHR_first_article_all.txt txt local-single-file 0.995 0.0 +handbook-1p.docx docx local-single-file-basic-chunking 0.858 0.029 fake-html-cp1252.html html local-single-file-with-encoding 0.659 0.0 layout-parser-paper-with-table.jpg jpg local-single-file-with-pdf-infer-table-structure 0.716 0.032 layout-parser-paper.pdf pdf local-single-file-with-pdf-infer-table-structure 0.949 0.029 diff --git a/test_unstructured_ingest/src/local-single-file-basic-chunking.sh b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh new file mode 100755 index 000000000..7786e1c63 --- /dev/null +++ b/test_unstructured_ingest/src/local-single-file-basic-chunking.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -e + +SRC_PATH=$(dirname "$(realpath "$0")") +SCRIPT_DIR=$(dirname "$SRC_PATH") +cd "$SCRIPT_DIR"/.. 
|| exit 1 +OUTPUT_FOLDER_NAME=local-single-file-basic-chunking +OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR} +OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME +WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME +# -- use absolute path of input file to verify passing an absolute path -- +ABS_INPUT_PATH="$SCRIPT_DIR/../example-docs/handbook-1p.docx" +max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")} + +# shellcheck disable=SC1091 +source "$SCRIPT_DIR"/cleanup.sh +# shellcheck disable=SC2317 +function cleanup() { + cleanup_dir "$OUTPUT_DIR" + cleanup_dir "$WORK_DIR" +} +trap cleanup EXIT + +RUN_SCRIPT=${RUN_SCRIPT:-./unstructured/ingest/main.py} + +PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \ + local \ + --chunking-strategy basic \ + --chunk-overlap 20 \ + --chunk-max-characters 150 \ + --input-path "$ABS_INPUT_PATH" \ + --metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \ + --num-processes "$max_processes" \ + --output-dir "$OUTPUT_DIR" \ + --reprocess \ + --verbose \ + --work-dir "$WORK_DIR" + +set +e +"$SCRIPT_DIR"/check-diff-expected-output.sh $OUTPUT_FOLDER_NAME +EXIT_CODE=$? +set -e + +if [ "$EXIT_CODE" -ne 0 ]; then + echo "The last script run exited with a non-zero exit code: $EXIT_CODE." 
+ # Handle the error or exit +fi + +"$SCRIPT_DIR"/evaluation-ingest-cp.sh "$OUTPUT_DIR" "$OUTPUT_FOLDER_NAME" + +exit $EXIT_CODE diff --git a/test_unstructured_ingest/test-ingest-src.sh b/test_unstructured_ingest/test-ingest-src.sh index d8118d73f..4bd8a4f72 100755 --- a/test_unstructured_ingest/test-ingest-src.sh +++ b/test_unstructured_ingest/test-ingest-src.sh @@ -43,6 +43,7 @@ all_tests=( # NOTE(ryan): This test is disabled because it is triggering too many requests to the API # 'airtable-large.sh' 'local-single-file.sh' + 'local-single-file-basic-chunking.sh' 'local-single-file-with-encoding.sh' 'local-single-file-with-pdf-infer-table-structure.sh' 'notion.sh' diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 7b1a13587..2a145450a 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.12.1-dev3" # pragma: no cover +__version__ = "0.12.1-dev4" # pragma: no cover diff --git a/unstructured/ingest/cli/interfaces.py b/unstructured/ingest/cli/interfaces.py index 38aab066c..df2ad5a5e 100644 --- a/unstructured/ingest/cli/interfaces.py +++ b/unstructured/ingest/cli/interfaces.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import json import os.path import typing as t @@ -8,6 +10,7 @@ from pathlib import Path import click from dataclasses_json.core import Json +from typing_extensions import Self from unstructured.ingest.interfaces import ( BaseConfig, @@ -462,59 +465,112 @@ class CliChunkingConfig(ChunkingConfig, CliMixin): ["--chunk-elements"], is_flag=True, default=False, + help="Deprecated, use --chunking-strategy instead.", + ), + click.Option( + ["--chunking-strategy"], + type=click.Choice(["basic", "by_title"]), + help="The rule-set to use to form chunks. 
Omit to disable chunking.", ), click.Option( ["--chunk-multipage-sections"], is_flag=True, default=False, + help=( + "Ignore page boundaries when chunking such that elements from two different" + " pages can appear in the same chunk. Only operative for 'by_title'" + " chunking-strategy." + ), ), click.Option( ["--chunk-combine-text-under-n-chars"], type=int, default=500, show_default=True, + help=( + "Combine consecutive chunks when the first does not exceed this length and" + " the second will fit without exceeding the hard-maximum length. Only" + " operative for 'by_title' chunking-strategy." + ), ), click.Option( ["--chunk-new-after-n-chars"], type=int, default=1500, show_default=True, + help=( + "Soft-maximum chunk length. Another element will not be added to a chunk of" + " this length even when it would fit without exceeding the hard-maximum" + " length." + ), ), click.Option( ["--chunk-max-characters"], type=int, default=1500, show_default=True, + help=( + "Hard maximum chunk length. No chunk will exceed this length. An oversized" + " element will be divided by text-splitting to fit this window." + ), + ), + click.Option( + ["--chunk-overlap"], + type=int, + default=0, + show_default=True, + help=( + "Prefix chunk text with last overlap=N characters of prior chunk. Only" + " applies to oversized chunks divided by text-splitting. To apply overlap to" + " non-oversized chunks use the --chunk-overlap-all option." + ), + ), + click.Option( + ["--chunk-overlap-all"], + is_flag=True, + default=False, + help=( + "Apply overlap to chunks formed from whole elements as well as those formed" + " by text-splitting oversized elements. Overlap length is taken from the" + " --chunk-overlap option value." + ), ), ] return options @classmethod - def from_dict(cls, kvs: Json, **kwargs): + def from_dict(cls, kvs: Json, **kwargs: t.Any) -> t.Optional[Self]: + """Extension of dataclass from_dict() to avoid a naming conflict with other CLI params. 
+ + This allows CLI arguments to be prefixed with "chunk_" during CLI invocation but doesn't + require that as part of the field names in this class """ - Extension of the dataclass from_dict() to avoid a naming conflict with other CLI params. - This allows CLI arguments to be prepended with chunking_ during CLI invocation but - doesn't require that as part of the field names in this class - """ - if isinstance(kvs, dict): - kvs = kvs.copy() - new_kvs = {} - if "chunk_elements" in kvs: - chunk_elements = kvs.pop("chunk_elements") - if not chunk_elements: - return None - new_kvs["chunk_elements"] = chunk_elements - new_kvs.update( - { - k[len("chunk_") :]: v # noqa: E203 - for k, v in kvs.items() - if k.startswith("chunk_") - }, + if not isinstance(kvs, dict): + return super().from_dict(kvs=kvs, **kwargs) + + options: t.Dict[str, t.Any] = kvs.copy() + chunk_elements = options.pop("chunk_elements", None) + chunking_strategy = options.pop("chunking_strategy", None) + # -- when neither is specified, chunking is not requested -- + if not chunk_elements and not chunking_strategy: + return None + + def iter_kv_pairs() -> t.Iterator[t.Tuple[str, t.Any]]: + # -- newer `chunking_strategy` option takes precedence over legacy `chunk_elements` -- + if chunking_strategy: + yield "chunking_strategy", chunking_strategy + # -- but legacy case is still supported, equivalent to `chunking_strategy="by_title"` -- + elif chunk_elements: + yield "chunking_strategy", "by_title" + + yield from ( + (key[len("chunk_") :], value) + for key, value in options.items() + if key.startswith("chunk_") ) - if len(new_kvs.keys()) == 0: - return None - return super().from_dict(kvs=new_kvs, **kwargs) - return super().from_dict(kvs=kvs, **kwargs) + + new_kvs = dict(iter_kv_pairs()) + return None if len(new_kvs) == 0 else super().from_dict(kvs=new_kvs, **kwargs) class CliPermissionsConfig(PermissionsConfig, CliMixin): diff --git a/unstructured/ingest/enhanced_dataclass/json_mixin.py 
b/unstructured/ingest/enhanced_dataclass/json_mixin.py index 493cebdf0..8ea50ddb0 100644 --- a/unstructured/ingest/enhanced_dataclass/json_mixin.py +++ b/unstructured/ingest/enhanced_dataclass/json_mixin.py @@ -40,12 +40,12 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin): allow_nan: bool = True, indent: t.Optional[t.Union[int, str]] = None, separators: t.Optional[t.Tuple[str, str]] = None, - default: t.Optional[t.Callable] = None, + default: t.Optional[t.Callable[..., t.Any]] = None, sort_keys: bool = False, - redact_sensitive=False, - redacted_text="***REDACTED***", + redact_sensitive: bool = False, + redacted_text: str = "***REDACTED***", apply_name_overload: bool = True, - **kw + **kw: t.Any, ) -> str: return json.dumps( self.to_dict( @@ -63,7 +63,7 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin): separators=separators, default=default, sort_keys=sort_keys, - **kw + **kw, ) @classmethod @@ -72,15 +72,15 @@ class EnhancedDataClassJsonMixin(DataClassJsonMixin): kvs: dataclasses_json_core.Json, *, infer_missing=False, - apply_name_overload=False + apply_name_overload=False, ) -> A: return dataclasses_json_core._decode_dataclass(cls, kvs, infer_missing) def to_dict( self, - encode_json=False, - redact_sensitive=False, - redacted_text="***REDACTED***", + encode_json: bool = False, + redact_sensitive: bool = False, + redacted_text: str = "***REDACTED***", apply_name_overload: bool = True, ) -> t.Dict[str, dataclasses_json_core.Json]: return _asdict( diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 6feea21f4..6baf214ac 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -14,6 +14,7 @@ from pathlib import Path from dataclasses_json import DataClassJsonMixin from dataclasses_json.core import Json, _decode_dataclass +from unstructured.chunking.basic import chunk_elements from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import 
DataSourceMetadata from unstructured.embed.interfaces import BaseEmbeddingEncoder, Element @@ -175,7 +176,7 @@ class FsspecConfig(FileStorageConfig): @dataclass class ReadConfig(BaseConfig): # where raw documents are stored for processing, and then removed if not preserve_downloads - download_dir: str = "" + download_dir: t.Optional[str] = "" re_download: bool = False preserve_downloads: bool = False download_only: bool = False @@ -213,22 +214,43 @@ class EmbeddingConfig(BaseConfig): @dataclass class ChunkingConfig(BaseConfig): chunk_elements: bool = False + chunking_strategy: t.Optional[str] = None multipage_sections: bool = True combine_text_under_n_chars: int = 500 max_characters: int = 1500 new_after_n_chars: t.Optional[int] = None + overlap: int = 0 + overlap_all: bool = False def chunk(self, elements: t.List[Element]) -> t.List[Element]: - if self.chunk_elements: - return chunk_by_title( + chunking_strategy = ( + self.chunking_strategy + if self.chunking_strategy in ("basic", "by_title") + else "by_title" + if self.chunk_elements is True + else None + ) + return ( + chunk_by_title( elements=elements, - multipage_sections=self.multipage_sections, combine_text_under_n_chars=self.combine_text_under_n_chars, max_characters=self.max_characters, + multipage_sections=self.multipage_sections, new_after_n_chars=self.new_after_n_chars, + overlap=self.overlap, + overlap_all=self.overlap_all, ) - else: - return elements + if chunking_strategy == "by_title" + else chunk_elements( + elements=elements, + max_characters=self.max_characters, + new_after_n_chars=self.new_after_n_chars, + overlap=self.overlap, + overlap_all=self.overlap_all, + ) + if chunking_strategy == "basic" + else elements + ) @dataclass diff --git a/unstructured/partition/epub.py b/unstructured/partition/epub.py index b27b90627..32891f3b1 100644 --- a/unstructured/partition/epub.py +++ b/unstructured/partition/epub.py @@ -1,4 +1,4 @@ -from typing import IO, List, Optional +from typing import IO, Any, 
List, Optional from unstructured.chunking import add_chunking_strategy from unstructured.documents.elements import Element, process_metadata @@ -22,7 +22,7 @@ def partition_epub( chunking_strategy: Optional[str] = None, languages: Optional[List[str]] = ["auto"], detect_language_per_element: bool = False, - **kwargs, + **kwargs: Any, ) -> List[Element]: """Partitions an EPUB document. The document is first converted to HTML and then partitioned using partition_html.