Mirror of https://github.com/Unstructured-IO/unstructured.git (synced 2025-11-02 02:53:31 +00:00)
**Summary**
Use a more sophisticated algorithm for splitting oversized `Table` elements into `TableChunk` elements during chunking, to ensure element text and HTML are "synchronized" and the HTML is always parseable.

**Additional Context**
Table splitting now has the following characteristics:

- `TableChunk.metadata.text_as_html` is always a parseable HTML `<table>` subtree.
- `TableChunk.text` is always the text of the HTML table fragment in `.metadata.text_as_html`. Text and HTML are "synchronized".
- The table is divided at a whole-row boundary whenever possible.
- A row is broken at an even-cell boundary when a single row is larger than the chunking window.
- A cell is broken at an even-word boundary when a single cell is larger than the chunking window.
- `.text_as_html` is "minified", removing all extraneous whitespace and unneeded elements or attributes. This maximizes the semantic "density" of each chunk.
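As a rough sketch of the behavior described above (not the library's internal splitting code), the snippet below builds an oversized `Table` element and chunks it; each resulting fragment should carry a parseable `<table>` subtree in `.metadata.text_as_html` whose text matches `.text`. The `chunk_by_title` import and `max_characters` argument are part of the public `unstructured` API, but treat the sample data and exact output as illustrative assumptions.

```python
# Illustrative sketch only: sample table and window size are assumptions.
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementMetadata, Table

# Build a table whose text/HTML exceed the chunking window.
rows_html = "".join(f"<tr><td>item {i}</td><td>value {i}</td></tr>" for i in range(50))
oversized_table = Table(
    text=" ".join(f"item {i} value {i}" for i in range(50)),
    metadata=ElementMetadata(text_as_html=f"<table>{rows_html}</table>"),
)

# A window smaller than the table forces it to be split into TableChunk fragments.
chunks = chunk_by_title([oversized_table], max_characters=500)

for chunk in chunks:
    # Each fragment's HTML is a minified, parseable <table> subtree whose text
    # matches chunk.text, split on whole-row boundaries where possible.
    print(type(chunk).__name__)
    print(chunk.text[:60])
    print(chunk.metadata.text_as_html[:60])
```

A table that already fits within the chunking window is left as a single chunk; only oversized tables are split this way.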
#!/usr/bin/env bash

set -e

DEST_PATH=$(dirname "$(realpath "$0")")
SCRIPT_DIR=$(dirname "$DEST_PATH")
cd "$SCRIPT_DIR"/.. || exit 1
OUTPUT_FOLDER_NAME=s3-pinecone-dest
OUTPUT_DIR=$SCRIPT_DIR/structured-output/$OUTPUT_FOLDER_NAME
WORK_DIR=$SCRIPT_DIR/workdir/$OUTPUT_FOLDER_NAME
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
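# Use one fewer writer process than max_processes, but never fewer than two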
writer_processes=$(((max_processes - 1) > 1 ? (max_processes - 1) : 2))

if [ -z "$PINECONE_API_KEY" ]; then
  echo "Skipping Pinecone ingest test because PINECONE_API_KEY env var is not set."
  exit 0
fi

RANDOM_SUFFIX=$((RANDOM % 100000 + 1))

# Set the variables with default values if they're not set in the environment
PINECONE_INDEX=${PINECONE_INDEX:-"ingest-test-$RANDOM_SUFFIX"}
PINECONE_HOST_POSTFIX=${PINECONE_HOST_POSTFIX:-"4627-b74a"}
PINECONE_ENVIRONMENT=${PINECONE_ENVIRONMENT:-"us-east1-gcp"}
PINECONE_PROJECT_ID=${PINECONE_PROJECT_ID:-"art8iaj"}

# shellcheck disable=SC1091
source "$SCRIPT_DIR"/cleanup.sh
function cleanup {

  # Get response code to check if index exists
  response_code=$(curl \
    -s -o /dev/null \
    -w "%{http_code}" \
    --request GET \
    --url "https://api.pinecone.io/indexes/$PINECONE_INDEX" \
    --header 'accept: application/json' \
    --header "Api-Key: $PINECONE_API_KEY")

  # Cleanup (delete) index if it exists
  if [ "$response_code" == "200" ]; then
    echo ""
    echo "deleting index $PINECONE_INDEX"
    curl --request DELETE \
      "https://api.pinecone.io/indexes/$PINECONE_INDEX" \
      --header "Api-Key: $PINECONE_API_KEY" \
      --header 'content-type: application/json'

  else
echo "There was an error during index deletion for index $PINECONE_INDEX, with response code: $response_code. It might be that index $PINECONE_INDEX does not exist, so there is nothing to delete."
  fi

  # Local file cleanup
  cleanup_dir "$WORK_DIR"
  cleanup_dir "$OUTPUT_DIR"
}

trap cleanup EXIT

echo "Creating index $PINECONE_INDEX"
response_code=$(curl \
  -s -o /dev/null \
  -w "%{http_code}" \
  --request POST \
  --url "https://api.pinecone.io/indexes" \
  --header "accept: application/json" \
  --header "content-type: application/json" \
  --header "Api-Key: $PINECONE_API_KEY" \
  --data '
{
  "name": "'"$PINECONE_INDEX"'",
  "dimension": 384,
  "metric": "cosine",
  "spec": {
    "serverless": {
      "cloud": "aws",
      "region": "us-east-1"
    }
  }
}
')

if [ "$response_code" -lt 400 ]; then
  echo "Index creation success: $response_code"
else
  echo "Index creation failure: $response_code"
  exit 1
fi
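# Run the ingest pipeline: partition the example document locally, chunk it by title,
# embed the chunks with the langchain-huggingface provider, and write the vectors to Pinecone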
PYTHONPATH=. ./unstructured/ingest/main.py \
  local \
  --num-processes "$max_processes" \
  --output-dir "$OUTPUT_DIR" \
  --strategy fast \
  --verbose \
  --reprocess \
  --input-path example-docs/book-war-and-peace-1225p.txt \
  --work-dir "$WORK_DIR" \
  --chunking-strategy by_title \
  --chunk-combine-text-under-n-chars 150 --chunk-new-after-n-chars 1500 --chunk-max-characters 2500 --chunk-multipage-sections \
  --embedding-provider "langchain-huggingface" \
  pinecone \
  --api-key "$PINECONE_API_KEY" \
  --index-name "$PINECONE_INDEX" \
  --environment "$PINECONE_ENVIRONMENT" \
  --batch-size 80 \
  --num-processes "$writer_processes"
# It can take some time for the index to catch up with the content that was written. This loop
# sleeps between checks to give the index time to process the writes, and gives up after three attempts.
num_of_vectors_remote=0
attempt=1
sleep_amount=30
while [ "$num_of_vectors_remote" -eq 0 ] && [ "$attempt" -lt 4 ]; do
  echo "attempt $attempt: sleeping $sleep_amount seconds to let index finish catching up after writes"
  sleep $sleep_amount

  num_of_vectors_remote=$(curl --request POST \
    -s \
    --url "https://$PINECONE_INDEX-$PINECONE_PROJECT_ID.svc.aped-$PINECONE_HOST_POSTFIX.pinecone.io/describe_index_stats" \
    --header "accept: application/json" \
    --header "content-type: application/json" \
    --header "Api-Key: $PINECONE_API_KEY" | jq -r '.totalVectorCount')

  echo "vector count in Pinecone: $num_of_vectors_remote"
  attempt=$((attempt + 1))
done
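# Expected number of embedded chunks written to the index for this document and chunking configuration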
EXPECTED=1825

if [ "$num_of_vectors_remote" -ne $EXPECTED ]; then
echo "Number of vectors in Pinecone are $num_of_vectors_remote when the expected number is $EXPECTED. Test failed."
  exit 1
fi