| 
									
										
										
										
											2023-02-14 12:27:45 -08:00
										 |  |  | #!/usr/bin/env bash
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-29 08:41:19 -05:00
										 |  |  | set -u -o pipefail | 
					
						
							| 
									
										
										
										
											2023-02-14 12:27:45 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-12-11 20:04:15 -05:00
										 |  |  | SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) | 
					
						
							| 
									
										
										
										
											2023-11-29 08:41:19 -05:00
										 |  |  | SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt | 
					
						
							|  |  |  | # If the file already exists, reset it | 
					
						
							|  |  |  | if [ -f "$SKIPPED_FILES_LOG" ]; then | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   rm "$SKIPPED_FILES_LOG" | 
					
						
							| 
									
										
										
										
											2023-11-29 08:41:19 -05:00
										 |  |  | fi | 
					
						
							| 
									
										
										
										
											2023-11-29 13:31:59 -05:00
										 |  |  | touch "$SKIPPED_FILES_LOG" | 
					
						
							| 
									
										
										
										
											2023-02-21 10:15:33 -08:00
										 |  |  | cd "$SCRIPT_DIR"/.. || exit 1 | 
					
						
							| 
									
										
										
										
											2023-02-16 08:45:50 -08:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-04-11 00:11:50 -07:00
										 |  |  | # NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs | 
					
						
							|  |  |  | export OMP_THREAD_LIMIT=1 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 09:39:34 -07:00
										 |  |  | all_tests=( | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   's3.sh' | 
					
						
							|  |  |  |   's3-minio.sh' | 
					
						
							|  |  |  |   'azure.sh' | 
					
						
							|  |  |  |   'biomed-api.sh' | 
					
						
							|  |  |  |   'biomed-path.sh' | 
					
						
							|  |  |  |   # NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files | 
					
						
							|  |  |  |   'pdf-fast-reprocess.sh' | 
					
						
							|  |  |  |   'salesforce.sh' | 
					
						
							|  |  |  |   'box.sh' | 
					
						
							|  |  |  |   'discord.sh' | 
					
						
							|  |  |  |   'dropbox.sh' | 
					
						
							|  |  |  |   'github.sh' | 
					
						
							|  |  |  |   'gitlab.sh' | 
					
						
							|  |  |  |   'google-drive.sh' | 
					
						
							|  |  |  |   'wikipedia.sh' | 
					
						
							|  |  |  |   'local.sh' | 
					
						
							|  |  |  |   'slack.sh' | 
					
						
							|  |  |  |   'against-api.sh' | 
					
						
							|  |  |  |   'gcs.sh' | 
					
						
							|  |  |  |   'onedrive.sh' | 
					
						
							|  |  |  |   'outlook.sh' | 
					
						
							|  |  |  |   'elasticsearch.sh' | 
					
						
							|  |  |  |   'confluence-diff.sh' | 
					
						
							|  |  |  |   'confluence-large.sh' | 
					
						
							|  |  |  |   'airtable-diff.sh' | 
					
						
							|  |  |  |   # NOTE(ryan): This test is disabled because it is triggering too many requests to the API | 
					
						
							|  |  |  |   # 'airtable-large.sh' | 
					
						
							|  |  |  |   'local-single-file.sh' | 
					
						
							| 
									
										
										
										
											2024-01-12 12:27:34 -08:00
										 |  |  |   'local-single-file-basic-chunking.sh' | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   'local-single-file-with-encoding.sh' | 
					
						
							|  |  |  |   'local-single-file-with-pdf-infer-table-structure.sh' | 
					
						
							|  |  |  |   'notion.sh' | 
					
						
							|  |  |  |   'delta-table.sh' | 
					
						
							|  |  |  |   'jira.sh' | 
					
						
							|  |  |  |   'sharepoint.sh' | 
					
						
							|  |  |  |   'sharepoint-with-permissions.sh' | 
					
						
							|  |  |  |   'hubspot.sh' | 
					
						
							|  |  |  |   'local-embed.sh' | 
					
						
							|  |  |  |   'sftp.sh' | 
					
						
							| 
									
										
										
										
											2024-01-16 12:56:29 -08:00
										 |  |  |   'mongodb.sh' | 
					
						
							| 
									
										
										
										
											2024-01-16 20:31:49 -08:00
										 |  |  |   'opensearch.sh' | 
					
						
							| 
									
										
										
										
											2023-09-21 14:51:08 -04:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 09:39:34 -07:00
										 |  |  | full_python_matrix_tests=( | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   'sharepoint.sh' | 
					
						
							|  |  |  |   'local.sh' | 
					
						
							|  |  |  |   'local-single-file.sh' | 
					
						
							|  |  |  |   'local-single-file-with-encoding.sh' | 
					
						
							|  |  |  |   'local-single-file-with-pdf-infer-table-structure.sh' | 
					
						
							|  |  |  |   's3.sh' | 
					
						
							|  |  |  |   'google-drive.sh' | 
					
						
							|  |  |  |   'gcs.sh' | 
					
						
							|  |  |  |   'azure.sh' | 
					
						
							| 
									
										
										
										
											2023-10-10 09:39:34 -07:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | CURRENT_TEST="none" | 
					
						
							| 
									
										
										
										
											2023-09-21 14:51:08 -04:00
										 |  |  | 
 | 
					
						
							|  |  |  | function print_last_run() { | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   if [ "$CURRENT_TEST" != "none" ]; then | 
					
						
							|  |  |  |     echo "Last ran script: $CURRENT_TEST" | 
					
						
							|  |  |  |   fi | 
					
						
							|  |  |  |   echo "######## SKIPPED TESTS: ########" | 
					
						
							|  |  |  |   cat "$SKIPPED_FILES_LOG" | 
					
						
							| 
									
										
										
										
											2023-09-21 14:51:08 -04:00
										 |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | trap print_last_run EXIT | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 09:39:34 -07:00
										 |  |  | python_version=$(python --version 2>&1) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
											  
											
												fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
    "mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
    "version": [
        {"text": "current: v1.7.2", "start": 7, "end": 21},
        {"text": "supersedes: v1.7.0", "start": 22, "end": 40},
    ],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
    Title(
        "Lorem Ipsum",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
        ),
    ),
    Text(
        "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
        metadata=ElementMetadata(
            regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
        ),
    ),
    Text(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
        ),
    ),
]
chunks = chunk_by_title(elements)
assert chunks == [
    CompositeElement(
        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
        " ipsum sed lectus porta volutpat."
    )
]
```
Observed behavior looked like this:
```python
chunks => [
    CompositeElement('Lorem Ipsum')
    CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
    CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
    Title(
        "Lorem Ipsum",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
        ),
    ),
    Text(
        "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
        metadata=ElementMetadata(
            regex_metadata={
                "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
            }
        ),
    ),
    Text(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
        ),
    ),
]
```
..should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
    "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
    " ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
    "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
    "ipsum": [
        RegexMetadata(text="Ipsum", start=6, end=11),
        RegexMetadata(text="ipsum", start=19, end=24),
        RegexMetadata(text="ipsum", start=81, end=86),
    ],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
											
										 
											2023-10-19 20:16:02 -07:00
										 |  |  | tests_to_ignore=( | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   'notion.sh' | 
					
						
							|  |  |  |   'dropbox.sh' | 
					
						
							| 
									
										
											  
											
												fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
    "mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
    "version": [
        {"text": "current: v1.7.2", "start": 7, "end": 21},
        {"text": "supersedes: v1.7.0", "start": 22, "end": 40},
    ],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
    Title(
        "Lorem Ipsum",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
        ),
    ),
    Text(
        "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
        metadata=ElementMetadata(
            regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
        ),
    ),
    Text(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
        ),
    ),
]
chunks = chunk_by_title(elements)
assert chunks == [
    CompositeElement(
        "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
        " ipsum sed lectus porta volutpat."
    )
]
```
Observed behavior looked like this:
```python
chunks => [
    CompositeElement('Lorem Ipsum')
    CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
    CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
    Title(
        "Lorem Ipsum",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
        ),
    ),
    Text(
        "Lorem ipsum dolor sit amet consectetur adipiscing elit.",
        metadata=ElementMetadata(
            regex_metadata={
                "dolor": [RegexMetadata(text="dolor", start=12, end=17)],
                "ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
            }
        ),
    ),
    Text(
        "In rhoncus ipsum sed lectus porta volutpat.",
        metadata=ElementMetadata(
            regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
        ),
    ),
]
```
..should produce this regex_metadata on the single produced chunk:
```python
assert chunk == CompositeElement(
    "Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
    " ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
    "dolor": [RegexMetadata(text="dolor", start=25, end=30)],
    "ipsum": [
        RegexMetadata(text="Ipsum", start=6, end=11),
        RegexMetadata(text="ipsum", start=19, end=24),
        RegexMetadata(text="ipsum", start=81, end=86),
    ],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
											
										 
											2023-10-19 20:16:02 -07:00
										 |  |  | ) | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-10 09:39:34 -07:00
										 |  |  | for test in "${all_tests[@]}"; do | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   CURRENT_TEST="$test" | 
					
						
							|  |  |  |   # IF: python_version is not 3.10 (wildcarded to match any subminor version) AND the current test is not in full_python_matrix_tests | 
					
						
							|  |  |  |   # Note: to test we expand the full_python_matrix_tests array to a string and then regex match the current test | 
					
						
							|  |  |  |   if [[ "$python_version" != "Python 3.10"* ]] && [[ ! "${full_python_matrix_tests[*]}" =~ $test ]]; then | 
					
						
							|  |  |  |     echo "--------- SKIPPING SCRIPT $test ---------" | 
					
						
							|  |  |  |     continue | 
					
						
							|  |  |  |   fi | 
					
						
							|  |  |  |   echo "--------- RUNNING SCRIPT $test ---------" | 
					
						
							|  |  |  |   echo "Running ./test_unstructured_ingest/$test" | 
					
						
							|  |  |  |   ./test_unstructured_ingest/src/"$test" | 
					
						
							|  |  |  |   rc=$? | 
					
						
							|  |  |  |   if [[ $rc -eq 8 ]]; then | 
					
						
							|  |  |  |     echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG" | 
					
						
							|  |  |  |   elif [[ "${tests_to_ignore[*]}" =~ $test ]]; then | 
					
						
							|  |  |  |     echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG" | 
					
						
							|  |  |  |     continue | 
					
						
							|  |  |  |   elif [[ $rc -ne 0 ]]; then | 
					
						
							|  |  |  |     exit $rc | 
					
						
							|  |  |  |   fi | 
					
						
							|  |  |  |   echo "--------- FINISHED SCRIPT $test ---------" | 
					
						
							| 
									
										
										
										
											2023-09-21 14:51:08 -04:00
										 |  |  | done | 
					
						
							| 
									
										
										
										
											2023-10-23 17:39:22 -04:00
										 |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-11-29 08:41:19 -05:00
										 |  |  | set +e | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2023-10-27 00:36:36 -04:00
										 |  |  | all_eval=( | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   'text-extraction' | 
					
						
							|  |  |  |   'element-type' | 
					
						
							| 
									
										
										
										
											2023-10-27 00:36:36 -04:00
										 |  |  | ) | 
					
						
							|  |  |  | for eval in "${all_eval[@]}"; do | 
					
						
							| 
									
										
										
										
											2023-12-18 23:48:21 -08:00
										 |  |  |   CURRENT_TEST="evaluation-metrics.sh $eval" | 
					
						
							|  |  |  |   echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------" | 
					
						
							|  |  |  |   ./test_unstructured_ingest/evaluation-metrics.sh "$eval" | 
					
						
							|  |  |  |   echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------" | 
					
						
							| 
									
										
										
										
											2023-10-27 14:07:00 +01:00
										 |  |  | done |