mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-07-03 23:20:35 +00:00
feat: Add stage_for_weaviate
and schema creation function (#672)
* add weaviate docker compose * added staging brick and tests for weaviate * initial notebook and requirements file * add commentary to weaviate notebook * weaviate readme * update docs * version and change log * install weaviate client * install weaviate; skip for docker * linting, linting, linting * install weaviate client with deps * comments on weaviate client * fix module not found error for docker container * skipped wrong test in docker * fix typos * add in local-inference
This commit is contained in:
parent
cf70c86574
commit
c35fff2972
3
.github/workflows/ci.yml
vendored
3
.github/workflows/ci.yml
vendored
@ -138,6 +138,9 @@ jobs:
|
|||||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||||
tesseract --version
|
tesseract --version
|
||||||
|
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
||||||
|
# version conflicts with label_studio_sdk
|
||||||
|
pip install weaviate-client
|
||||||
make test
|
make test
|
||||||
make check-coverage
|
make check-coverage
|
||||||
|
|
||||||
|
@ -2,10 +2,12 @@
|
|||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
|
||||||
* Builds from Unstructured base image, built off of Rocky Linux 8.7, this resolves almost all CVE's in the image.
|
|
||||||
|
|
||||||
### Features
|
### Features
|
||||||
|
|
||||||
|
* Add `stage_for_weaviate` to stage `unstructured` outputs for upload to Weaviate, along with
|
||||||
|
a helper function for defining a class to use in Weaviate schemas.
|
||||||
|
* Builds from Unstructured base image, built off of Rocky Linux 8.7, this resolves almost all CVE's in the image.
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|
||||||
## 0.7.0
|
## 0.7.0
|
||||||
|
3
Makefile
3
Makefile
@ -41,6 +41,9 @@ install-nltk-models:
|
|||||||
.PHONY: install-test
|
.PHONY: install-test
|
||||||
install-test:
|
install-test:
|
||||||
python3 -m pip install -r requirements/test.txt
|
python3 -m pip install -r requirements/test.txt
|
||||||
|
# NOTE(robinson) - Installing weaviate-client separately here because the requests
|
||||||
|
# version conflicts with label_studio_sdk
|
||||||
|
python3 -m pip install weaviate-client
|
||||||
|
|
||||||
.PHONY: install-dev
|
.PHONY: install-dev
|
||||||
install-dev:
|
install-dev:
|
||||||
|
@ -1554,6 +1554,58 @@ See the `LabelStudio docs <https://labelstud.io/tags/labels.html>`_ for a full l
|
|||||||
for labels and annotations.
|
for labels and annotations.
|
||||||
|
|
||||||
|
|
||||||
|
``stage_for_weaviate``
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The ``stage_for_weaviate`` staging function prepares a list of ``Element`` objects for ingestion into
|
||||||
|
the `Weaviate <https://weaviate.io/>`_ vector database. You can create a schema in Weaviate
|
||||||
|
for the ``unstructured`` outputs using the following workflow:
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.staging.weaviate import create_unstructured_weaviate_class
|
||||||
|
|
||||||
|
import weaviate
|
||||||
|
|
||||||
|
# Change `class_name` if you want the class for unstructured documents in Weaviate
|
||||||
|
# to have a different name
|
||||||
|
unstructured_class = create_unstructured_weaviate_class(class_name="UnstructuredDocument")
|
||||||
|
schema = {"classes": [unstructured_class]}
|
||||||
|
|
||||||
|
client = weaviate.Client("http://localhost:8080")
|
||||||
|
client.schema.create(schema)
|
||||||
|
|
||||||
|
|
||||||
|
Once the schema is created, you can batch upload documents to Weaviate using the following workflow.
|
||||||
|
See the `Weaviate documentation <https://weaviate.io/developers/weaviate>`_ for more details on
|
||||||
|
options for uploading data and querying data once it has been uploaded.
|
||||||
|
|
||||||
|
|
||||||
|
.. code:: python
|
||||||
|
|
||||||
|
from unstructured.partition.pdf import partition_pdf
|
||||||
|
from unstructured.staging.weaviate import stage_for_weaviate
|
||||||
|
|
||||||
|
import tqdm
import weaviate
|
||||||
|
from weaviate.util import generate_uuid5
|
||||||
|
|
||||||
|
|
||||||
|
filename = "example-docs/layout-parser-paper-fast.pdf"
|
||||||
|
elements = partition_pdf(filename=filename, strategy="fast")
|
||||||
|
data_objects = stage_for_weaviate(elements)
|
||||||
|
|
||||||
|
client = weaviate.Client("http://localhost:8080")
|
||||||
|
|
||||||
|
with client.batch(batch_size=10) as batch:
|
||||||
|
for data_object in tqdm.tqdm(data_objects):
|
||||||
|
batch.add_data_object(
|
||||||
|
data_object,
|
||||||
|
"UnstructuredDocument",
|
||||||
|
uuid=generate_uuid5(data_object),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
``stage_for_baseplate``
|
``stage_for_baseplate``
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
|
@ -75,3 +75,13 @@ the text from each element and their types such as ``NarrativeText`` or ``Title`
|
|||||||
-----------------------------
|
-----------------------------
|
||||||
You can format your JSON or CSV outputs for use with `Prodigy <https://prodi.gy/docs/api-loaders>`_ using the `stage_for_prodigy <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-prodigy>`_ and `stage_csv_for_prodigy <https://unstructured-io.github.io/unstructured/bricks.html#stage-csv-for-prodigy>`_ staging bricks. After running ``stage_for_prodigy`` |
|
You can format your JSON or CSV outputs for use with `Prodigy <https://prodi.gy/docs/api-loaders>`_ using the `stage_for_prodigy <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-prodigy>`_ and `stage_csv_for_prodigy <https://unstructured-io.github.io/unstructured/bricks.html#stage-csv-for-prodigy>`_ staging bricks. After running ``stage_for_prodigy`` |
|
||||||
``stage_csv_for_prodigy``, you can write the results to a ``.json`` | ``.jsonl`` or a ``.csv`` file that is ready to be used with Prodigy. Follow the links for more details on usage.
|
``stage_csv_for_prodigy``, you can write the results to a ``.json`` | ``.jsonl`` or a ``.csv`` file that is ready to be used with Prodigy. Follow the links for more details on usage.
|
||||||
|
|
||||||
|
|
||||||
|
``Integration with Weaviate``
|
||||||
|
-----------------------------
|
||||||
|
`Weaviate <https://weaviate.io/>`_ is an open-source vector database that allows you to store data objects and vector embeddings
|
||||||
|
from a variety of ML models. Storing text and embeddings in a vector database such as Weaviate is a key component of the
|
||||||
|
`emerging LLM tech stack <https://medium.com/@unstructured-io/llms-and-the-emerging-ml-tech-stack-bdb189c8be5c>`_.
|
||||||
|
See the `stage_for_weaviate <https://unstructured-io.github.io/unstructured/bricks.html#stage-for-weaviate>`_ docs for details
|
||||||
|
on how to upload ``unstructured`` outputs to Weaviate. An example notebook is also available
|
||||||
|
`here <https://github.com/Unstructured-IO/unstructured/tree/main/examples/weaviate>`_.
|
||||||
|
8
examples/weaviate/README.md
Normal file
8
examples/weaviate/README.md
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
## Uploading data to Weaviate with `unstructured`
|
||||||
|
|
||||||
|
The example notebook in this directory shows how to upload documents to Weaviate using the
|
||||||
|
`unstructured` library. To get started with the notebook, use the following steps:
|
||||||
|
|
||||||
|
- Run `pip install -r requirements.txt` to install the requirements.
|
||||||
|
- Run `docker-compose up` to run the Weaviate container.
|
||||||
|
- Run `jupyter-notebook` to start the notebook.
|
20
examples/weaviate/docker-compose.yml
Normal file
20
examples/weaviate/docker-compose.yml
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
version: '3.4'
|
||||||
|
services:
|
||||||
|
weaviate:
|
||||||
|
image: semitechnologies/weaviate:1.19.6
|
||||||
|
restart: on-failure:0
|
||||||
|
ports:
|
||||||
|
- "8080:8080"
|
||||||
|
environment:
|
||||||
|
QUERY_DEFAULTS_LIMIT: 20
|
||||||
|
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
|
||||||
|
PERSISTENCE_DATA_PATH: "./data"
|
||||||
|
DEFAULT_VECTORIZER_MODULE: text2vec-transformers
|
||||||
|
ENABLE_MODULES: text2vec-transformers
|
||||||
|
TRANSFORMERS_INFERENCE_API: http://t2v-transformers:8080
|
||||||
|
CLUSTER_HOSTNAME: 'node1'
|
||||||
|
t2v-transformers:
|
||||||
|
image: semitechnologies/transformers-inference:sentence-transformers-multi-qa-MiniLM-L6-cos-v1
|
||||||
|
environment:
|
||||||
|
ENABLE_CUDA: 0 # set to 1 to enable
|
||||||
|
# NVIDIA_VISIBLE_DEVICES: all # enable if running with CUDA
|
4
examples/weaviate/requirements.txt
Normal file
4
examples/weaviate/requirements.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
jupyter
|
||||||
|
tqdm
|
||||||
|
weaviate-client
|
||||||
|
unstructured[local-inference]
|
215
examples/weaviate/weaviate.ipynb
Normal file
215
examples/weaviate/weaviate.ipynb
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "a3ce962e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading Data into Weaviate with `unstructured`\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows a basic workflow for uploading document elements into Weaviate using the `unstructured` library. To get started with this notebook, first install the dependencies with `pip install -r requirements.txt` and start the Weaviate docker container with `docker-compose up`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "5d9ffc17",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"\n",
|
||||||
|
"import tqdm\n",
|
||||||
|
"from unstructured.partition.pdf import partition_pdf\n",
|
||||||
|
"from unstructured.staging.weaviate import create_unstructured_weaviate_class, stage_for_weaviate\n",
|
||||||
|
"import weaviate\n",
|
||||||
|
"from weaviate.util import generate_uuid5"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "673715e9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The first step is to partition the document using the `unstructured` library. In the following example, we partition a PDF with `partition_pdf`. You can also partition over a dozen document types with the `partition` function."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f9fc0cf9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"filename = \"../../example-docs/layout-parser-paper-fast.pdf\"\n",
|
||||||
|
"elements = partition_pdf(filename=filename, strategy=\"fast\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3ae76364",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Next, we'll create a schema for our Weaviate database using the `create_unstructured_weaviate_class` helper function from the `unstructured` library. The helper function generates a schema that includes all of the elements in the `ElementMetadata` object from `unstructured`. This includes information such as the filename and the page number of the document element. After specifying the schema, we create a connection to the database with the Weaviate client library and create the schema. You can change the name of the class by updating the `unstructured_class_name` variable."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "91057cb1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"unstructured_class_name = \"UnstructuredDocument\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "78e804bb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"unstructured_class = create_unstructured_weaviate_class(unstructured_class_name)\n",
|
||||||
|
"schema = {\"classes\": [unstructured_class]} "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "3e317a2d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"client = weaviate.Client(\"http://localhost:8080\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "0c508784",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"client.schema.create(schema)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "024ae133",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Next, we stage the elements for Weaviate using the `stage_for_weaviate` function and batch upload the results to Weaviate. `stage_for_weaviate` outputs a list of dictionaries that conform to the schema we created earlier. Once the data is staged, we can use the Weaviate client library to batch upload the results to Weaviate."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "a7018bb1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data_objects = stage_for_weaviate(elements)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "af712d8e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"100%|██████████████████████████████████████████████████████████████████████| 28/28 [00:46<00:00, 1.66s/it]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"with client.batch(batch_size=10) as batch:\n",
|
||||||
|
" for data_object in tqdm.tqdm(data_objects):\n",
|
||||||
|
" batch.add_data_object(\n",
|
||||||
|
" data_object,\n",
|
||||||
|
" unstructured_class_name,\n",
|
||||||
|
" uuid=generate_uuid5(data_object),\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "dac10bf5",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Now that the documents are in Weaviate, we're able to run queries against Weaviate!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "14098434",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"{\n",
|
||||||
|
" \"data\": {\n",
|
||||||
|
" \"Get\": {\n",
|
||||||
|
" \"UnstructuredDocument\": [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"text\": \"Deep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classi\\ufb01cation [11,\"\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
"}\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"near_text = {\"concepts\": [\"document understanding\"]}\n",
|
||||||
|
"\n",
|
||||||
|
"result = (\n",
|
||||||
|
" client.query\n",
|
||||||
|
" .get(\"UnstructuredDocument\", [\"text\"])\n",
|
||||||
|
" .with_near_text(near_text)\n",
|
||||||
|
" .with_limit(1)\n",
|
||||||
|
" .do()\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(json.dumps(result, indent=4))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "c191217c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.8.13"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
54
test_unstructured/staging/test_weaviate.py
Normal file
54
test_unstructured/staging/test_weaviate.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import contextlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# NOTE(robinson) - allows tests that do not require the weaviate client to
|
||||||
|
# run for the docker container
|
||||||
|
with contextlib.suppress(ModuleNotFoundError):
|
||||||
|
from weaviate.schema.validate_schema import validate_schema
|
||||||
|
|
||||||
|
from unstructured.partition.json import partition_json
|
||||||
|
from unstructured.staging.weaviate import (
|
||||||
|
create_unstructured_weaviate_class,
|
||||||
|
stage_for_weaviate,
|
||||||
|
)
|
||||||
|
|
||||||
|
is_in_docker = os.path.exists("/.dockerenv")
|
||||||
|
|
||||||
|
|
||||||
|
def test_stage_for_weaviate(filename="example-docs/layout-parser-paper-fast.pdf"):
    """Round-trips one serialized Title element through stage_for_weaviate.

    The staged object is expected to hold the element's metadata fields plus
    "text" and "category" keys. The ``filename`` argument is not used by the
    test body; it is kept so the signature stays unchanged.
    """
    title_text = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    metadata = {
        "filename": "layout-parser-paper-fast.pdf",
        "filetype": "application/json",
        "page_number": 1,
    }
    serialized_element = {
        "element_id": "015301d4f56aa4b20ec10ac889d2343f",
        "coordinates": (
            (157.62199999999999, 114.23496279999995),
            (157.62199999999999, 146.5141628),
            (457.7358962799999, 146.5141628),
            (457.7358962799999, 114.23496279999995),
        ),
        "text": title_text,
        "type": "Title",
        "metadata": metadata,
    }

    staged = stage_for_weaviate(partition_json(text=json.dumps([serialized_element])))

    # The element "type" is expected to surface under the "category" key.
    assert staged[0] == {**metadata, "text": title_text, "category": "Title"}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_weaviate_schema_is_valid():
    """Runs the default generated class through the Weaviate client's schema validator."""
    # NOTE(review): there is no assert here, so validate_schema presumably raises
    # on an invalid schema - confirm against the weaviate client.
    validate_schema({"classes": [create_unstructured_weaviate_class()]})
|
81
unstructured/staging/weaviate.py
Normal file
81
unstructured/staging/weaviate.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
from typing import Any, Dict, List, TypedDict
|
||||||
|
|
||||||
|
from unstructured.documents.elements import ElementMetadata, Text
|
||||||
|
|
||||||
|
|
||||||
|
# Shape of one entry in a Weaviate class's "properties" list: the property
# name and the list of Weaviate dataType names assigned to it.
Properties = TypedDict("Properties", {"name": str, "dataType": List[str]})
|
||||||
|
|
||||||
|
|
||||||
|
def stage_for_weaviate(elements: List[Text]) -> List[Dict[str, Any]]:
    """Builds one Weaviate data object per element.

    Each data object is the element's metadata dictionary extended with the
    element's text under "text" and its category under "category". The outputs
    are meant to match the schema class produced by
    create_unstructured_weaviate_class.

    Parameters
    ----------
    elements
        The elements to stage.

    Returns
    -------
    List[Dict[str, Any]]
        The data objects, in the same order as the input elements.

    References
    ----------
    https://weaviate.io/developers/weaviate/tutorials/import#batch-import-process
    """
    return [
        {
            # Metadata comes first so "text" and "category" win any key clash.
            **element.metadata.to_dict(),
            "text": element.text,
            "category": element.category,
        }
        for element in elements
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def create_unstructured_weaviate_class(class_name: str = "UnstructuredDocument"):
    """Builds a Weaviate schema class describing Unstructured documents.

    The class always has "text" and "category" text properties, followed by
    one property per annotated field of ElementMetadata.

    Parameters
    ----------
    class_name: str
        The name to use for the Unstructured class in the schema.
        Defaults to "UnstructuredDocument".

    Returns
    -------
    dict
        A dictionary with the class name under "class" and the property
        definitions under "properties".

    Raises
    ------
    ValueError
        If an ElementMetadata annotation has no matching Weaviate dataType.

    References
    ----------
    https://weaviate.io/developers/weaviate/client-libraries/python#manual-batching
    """
    # Properties that come from the element itself rather than its metadata.
    element_properties: List[Properties] = [
        {"name": "text", "dataType": ["text"]},
        {"name": "category", "dataType": ["text"]},
    ]
    metadata_properties: List[Properties] = [
        {
            "name": field_name,
            "dataType": _annotation_to_weaviate_data_type(field_annotation),
        }
        for field_name, field_annotation in ElementMetadata.__annotations__.items()
    ]

    return {
        "class": class_name,
        "properties": element_properties + metadata_properties,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _annotation_to_weaviate_data_type(annotation: Any) -> List[str]:
    """Maps a type annotation to the matching Weaviate ``dataType`` list.

    The lookup is a substring match on the annotation's text, checked in the
    order ``str``, ``int``, ``date``, so a wrapped annotation such as
    ``"Optional[str]"`` resolves to the type it contains.

    Parameters
    ----------
    annotation
        The annotation to convert. Usually a string such as ``"Optional[int]"``.
        Annotation objects such as ``int`` are also accepted and are matched
        against their ``str()`` representation.

    Returns
    -------
    List[str]
        ``["text"]``, ``["int"]`` or ``["date"]``.

    Raises
    ------
    ValueError
        If the annotation contains none of ``str``, ``int`` or ``date``.
    """
    # NOTE - __annotations__ only holds strings when the defining module postpones
    # annotation evaluation. Normalizing to text keeps the substring checks below
    # from raising TypeError when an evaluated annotation object is passed instead.
    annotation_text = annotation if isinstance(annotation, str) else str(annotation)

    if "str" in annotation_text:
        return ["text"]
    if "int" in annotation_text:
        return ["int"]
    if "date" in annotation_text:
        return ["date"]
    raise ValueError(f"Annotation {annotation} does not map to a Weaviate dataType.")
|
Loading…
x
Reference in New Issue
Block a user