2023-02-14 12:27:45 -08:00
|
|
|
#!/usr/bin/env bash
|
|
|
|
|
2023-11-29 08:41:19 -05:00
|
|
|
set -u -o pipefail
|
2023-02-14 12:27:45 -08:00
|
|
|
|
2023-12-11 20:04:15 -05:00
|
|
|
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
2023-11-29 08:41:19 -05:00
|
|
|
SKIPPED_FILES_LOG=$SCRIPT_DIR/skipped-files.txt
|
|
|
|
# If the file already exists, reset it
|
|
|
|
if [ -f "$SKIPPED_FILES_LOG" ]; then
|
2023-12-18 23:48:21 -08:00
|
|
|
rm "$SKIPPED_FILES_LOG"
|
2023-11-29 08:41:19 -05:00
|
|
|
fi
|
2023-11-29 13:31:59 -05:00
|
|
|
touch "$SKIPPED_FILES_LOG"
|
2023-02-21 10:15:33 -08:00
|
|
|
cd "$SCRIPT_DIR"/.. || exit 1
|
2023-02-16 08:45:50 -08:00
|
|
|
|
2023-04-11 00:11:50 -07:00
|
|
|
# NOTE(crag): sets number of tesseract threads to 1 which may help with more reproducible outputs
|
|
|
|
export OMP_THREAD_LIMIT=1
|
|
|
|
|
2023-10-10 09:39:34 -07:00
|
|
|
all_tests=(
|
2023-12-18 23:48:21 -08:00
|
|
|
's3.sh'
|
|
|
|
's3-minio.sh'
|
|
|
|
'azure.sh'
|
|
|
|
'biomed-api.sh'
|
|
|
|
'biomed-path.sh'
|
|
|
|
# NOTE(yuming): The pdf-fast-reprocess test should be put after any tests that save downloaded files
|
|
|
|
'pdf-fast-reprocess.sh'
|
|
|
|
'salesforce.sh'
|
|
|
|
'box.sh'
|
|
|
|
'discord.sh'
|
|
|
|
'dropbox.sh'
|
|
|
|
'github.sh'
|
|
|
|
'gitlab.sh'
|
|
|
|
'google-drive.sh'
|
|
|
|
'wikipedia.sh'
|
|
|
|
'local.sh'
|
|
|
|
'slack.sh'
|
|
|
|
'against-api.sh'
|
|
|
|
'gcs.sh'
|
|
|
|
'onedrive.sh'
|
|
|
|
'outlook.sh'
|
|
|
|
'elasticsearch.sh'
|
|
|
|
'confluence-diff.sh'
|
|
|
|
'confluence-large.sh'
|
|
|
|
'airtable-diff.sh'
|
|
|
|
# NOTE(ryan): This test is disabled because it is triggering too many requests to the API
|
|
|
|
# 'airtable-large.sh'
|
|
|
|
'local-single-file.sh'
|
|
|
|
'local-single-file-with-encoding.sh'
|
|
|
|
'local-single-file-with-pdf-infer-table-structure.sh'
|
|
|
|
'notion.sh'
|
|
|
|
'delta-table.sh'
|
|
|
|
'jira.sh'
|
|
|
|
'sharepoint.sh'
|
|
|
|
'sharepoint-with-permissions.sh'
|
|
|
|
'hubspot.sh'
|
|
|
|
'local-embed.sh'
|
|
|
|
'sftp.sh'
|
2023-09-21 14:51:08 -04:00
|
|
|
)
|
|
|
|
|
2023-10-10 09:39:34 -07:00
|
|
|
full_python_matrix_tests=(
|
2023-12-18 23:48:21 -08:00
|
|
|
'sharepoint.sh'
|
|
|
|
'local.sh'
|
|
|
|
'local-single-file.sh'
|
|
|
|
'local-single-file-with-encoding.sh'
|
|
|
|
'local-single-file-with-pdf-infer-table-structure.sh'
|
|
|
|
's3.sh'
|
|
|
|
'google-drive.sh'
|
|
|
|
'gcs.sh'
|
|
|
|
'azure.sh'
|
2023-10-10 09:39:34 -07:00
|
|
|
)
|
|
|
|
|
|
|
|
CURRENT_TEST="none"
|
2023-09-21 14:51:08 -04:00
|
|
|
|
|
|
|
#######################################
# EXIT-trap handler: reports the most recently started test script and
# lists the tests that were recorded as skipped.
# Globals:
#   CURRENT_TEST       (read) name of the last script started, or "none"
#   SKIPPED_FILES_LOG  (read) path of the file collecting skipped-test lines
# Outputs:
#   Writes the report to stdout.
#######################################
function print_last_run() {
  if [[ "$CURRENT_TEST" != "none" ]]; then
    echo "Last ran script: $CURRENT_TEST"
  fi
  echo "######## SKIPPED TESTS: ########"
  # Only dump the log when it is readable, so a missing file does not add a
  # `cat` error to the output while the script is exiting.
  if [[ -r "$SKIPPED_FILES_LOG" ]]; then
    cat "$SKIPPED_FILES_LOG"
  fi
}
|
|
|
|
|
|
|
|
trap print_last_run EXIT
|
|
|
|
|
2023-10-10 09:39:34 -07:00
|
|
|
python_version=$(python --version 2>&1)
|
|
|
|
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
...should produce this regex_metadata on the single resulting chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
tests_to_ignore=(
|
2023-12-18 23:48:21 -08:00
|
|
|
'notion.sh'
|
|
|
|
'dropbox.sh'
|
fix: chunks break on regex-meta changes and regex-meta start/stop not adjusted (#1779)
**Executive Summary.** Introducing strict type-checking as preparation
for adding the chunk-overlap feature revealed a type mismatch for
regex-metadata between chunking tests and the (authoritative)
ElementMetadata definition. The implementation of regex-metadata aspects
of chunking passed the tests but did not produce the appropriate
behaviors in production where the actual data-structure was different.
This PR fixes these two bugs.
1. **Over-chunking.** The presence of `regex-metadata` in an element was
incorrectly being interpreted as a semantic boundary, leading to such
elements being isolated in their own chunks.
2. **Discarded regex-metadata.** regex-metadata present on the second or
later elements in a section (chunk) was discarded.
**Technical Summary**
The type of `ElementMetadata.regex_metadata` is `Dict[str,
List[RegexMetadata]]`. `RegexMetadata` is a `TypedDict` like `{"text":
"this matched", "start": 7, "end": 19}`.
Multiple regexes can be specified, each with a name like "mail-stop",
"version", etc. Each of those may produce its own set of matches, like:
```python
>>> element.regex_metadata
{
"mail-stop": [{"text": "MS-107", "start": 18, "end": 24}],
"version": [
{"text": "current: v1.7.2", "start": 7, "end": 21},
{"text": "supersedes: v1.7.0", "start": 22, "end": 40},
],
}
```
*Forensic analysis*
* The regex-metadata feature was added by Matt Robinson on 06/16/2023
commit: 4ea71683. The regex_metadata data structure is the same as when
it was added.
* The chunk-by-title feature was added by Matt Robinson on 08/29/2023
commit: f6a745a7. The mistaken regex-metadata data structure in the
tests is present in that commit.
Looks to me like a mis-remembering of the regex-metadata data-structure
and insufficient type-checking rigor (type-checker strictness level set
too low) to warn of the mistake.
**Over-chunking Behavior**
The over-chunking looked like this:
Chunking three elements with regex metadata should combine them into a
single chunk (`CompositeElement` object), subject to maximum size rules
(default 500 chars).
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={"dolor": [RegexMetadata(text="dolor", start=12, end=17)]}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
chunks = chunk_by_title(elements)
assert chunks == [
CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
]
```
Observed behavior looked like this:
```python
chunks => [
CompositeElement('Lorem Ipsum')
CompositeElement('Lorem ipsum dolor sit amet consectetur adipiscing elit.')
CompositeElement('In rhoncus ipsum sed lectus porta volutpat.')
]
```
The fix changed the approach from breaking on any metadata field not in
a specified group (`regex_metadata` was missing from this group) to only
breaking on specified fields (whitelisting instead of blacklisting).
This avoids overchunking every time we add a new metadata field and is
also simpler and easier to understand. This change in approach is
discussed in more detail here #1790.
**Dropping regex-metadata Behavior**
Chunking this section:
```python
elements: List[Element] = [
Title(
"Lorem Ipsum",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="Ipsum", start=6, end=11)]}
),
),
Text(
"Lorem ipsum dolor sit amet consectetur adipiscing elit.",
metadata=ElementMetadata(
regex_metadata={
"dolor": [RegexMetadata(text="dolor", start=12, end=17)],
"ipsum": [RegexMetadata(text="ipsum", start=6, end=11)],
}
),
),
Text(
"In rhoncus ipsum sed lectus porta volutpat.",
metadata=ElementMetadata(
regex_metadata={"ipsum": [RegexMetadata(text="ipsum", start=11, end=16)]}
),
),
]
```
...should produce this regex_metadata on the single resulting chunk:
```python
assert chunk == CompositeElement(
"Lorem Ipsum\n\nLorem ipsum dolor sit amet consectetur adipiscing elit.\n\nIn rhoncus"
" ipsum sed lectus porta volutpat."
)
assert chunk.metadata.regex_metadata == {
"dolor": [RegexMetadata(text="dolor", start=25, end=30)],
"ipsum": [
RegexMetadata(text="Ipsum", start=6, end=11),
RegexMetadata(text="ipsum", start=19, end=24),
RegexMetadata(text="ipsum", start=81, end=86),
],
}
```
but instead produced this:
```python
regex_metadata == {"ipsum": [{"text": "Ipsum", "start": 6, "end": 11}]}
```
Which is the regex-metadata from the first element only.
The fix was to remove the consolidation+adjustment process from inside
the "list-attribute-processing" loop (because regex-metadata is not a
list) and process regex metadata separately.
2023-10-19 20:16:02 -07:00
|
|
|
)
|
|
|
|
|
2023-10-10 09:39:34 -07:00
|
|
|
#######################################
# Returns 0 when the first argument is exactly equal to one of the
# remaining arguments, 1 otherwise.
# Arguments:
#   $1   - value to look for
#   $2.. - candidate values (may be empty)
#######################################
_list_contains() {
  local needle=$1
  shift
  local candidate
  for candidate in "$@"; do
    if [[ "$candidate" == "$needle" ]]; then
      return 0
    fi
  done
  return 1
}

# Run every ingest test script in order, stopping at the first real failure.
for test in "${all_tests[@]}"; do
  CURRENT_TEST="$test"
  # IF: python_version is not 3.10 (wildcarded to match any subminor version)
  # AND the current test is not in full_python_matrix_tests, skip it.
  # Note: membership is an exact string comparison. A regex match against the
  # joined array would also accept substrings of other entries and would
  # treat the "." in the script names as a wildcard.
  if [[ "$python_version" != "Python 3.10"* ]] \
    && ! _list_contains "$test" "${full_python_matrix_tests[@]}"; then
    echo "--------- SKIPPING SCRIPT $test ---------"
    continue
  fi
  echo "--------- RUNNING SCRIPT $test ---------"
  echo "Running ./test_unstructured_ingest/src/$test"
  ./test_unstructured_ingest/src/"$test"
  rc=$?
  if [[ $rc -eq 8 ]]; then
    # Exit code 8 is reported as "skipped due to missing env var".
    echo "$test (skipped due to missing env var)" | tee -a "$SKIPPED_FILES_LOG"
  elif _list_contains "$test" "${tests_to_ignore[@]}"; then
    # Exact match only: 'box.sh' must not be ignored just because
    # 'dropbox.sh' is in tests_to_ignore.
    echo "$test (skipped checking error code: $rc)" | tee -a "$SKIPPED_FILES_LOG"
    continue
  elif [[ $rc -ne 0 ]]; then
    # Propagate the failing script's status; the EXIT trap reports which one.
    exit $rc
  fi
  echo "--------- FINISHED SCRIPT $test ---------"
done
|
2023-10-23 17:39:22 -04:00
|
|
|
|
2023-11-29 08:41:19 -05:00
|
|
|
# Turn errexit off before the evaluation runs below.
# NOTE(review): the `set` call at the top of this script enables only
# nounset and pipefail, so this looks like a no-op unless the caller
# started bash with -e — confirm.
set +e
|
|
|
|
|
2023-10-27 00:36:36 -04:00
|
|
|
all_eval=(
|
2023-12-18 23:48:21 -08:00
|
|
|
'text-extraction'
|
|
|
|
'element-type'
|
2023-10-27 00:36:36 -04:00
|
|
|
)
|
|
|
|
# Run evaluation-metrics.sh once for each entry in all_eval.
# CURRENT_TEST is updated first so the EXIT trap names the evaluation run.
# The exit status of evaluation-metrics.sh is not inspected here, so the
# loop always continues to the next entry.
for eval in "${all_eval[@]}"; do
|
2023-12-18 23:48:21 -08:00
|
|
|
CURRENT_TEST="evaluation-metrics.sh $eval"
|
|
|
|
echo "--------- RUNNING SCRIPT evaluation-metrics.sh $eval ---------"
|
|
|
|
./test_unstructured_ingest/evaluation-metrics.sh "$eval"
|
|
|
|
echo "--------- FINISHED SCRIPT evaluation-metrics.sh $eval ---------"
|
2023-10-27 14:07:00 +01:00
|
|
|
done
|