Steve Canny 086b8d6f8a
rfctr(part): prepare for pluggable auto-partitioners 2 (#3657)
**Summary**
Step 2 in prep for pluggable auto-partitioners, remove `regex_metadata`
field from `ElementMetadata`.

**Additional Context**
- "regex-metadata" was an experimental feature that didn't pan out.
- It's implemented by one of the post-partitioning metadata decorators,
so get rid of it as part of the cleanup before consolidating those
decorators.
2024-09-24 17:33:25 +00:00

415 lines
13 KiB
JSON

{
"class": "Elements",
"invertedIndexConfig": {
"bm25": {
"b": 0.75,
"k1": 1.2
},
"cleanupIntervalSeconds": 60,
"stopwords": {
"additions": null,
"preset": "en",
"removals": null
}
},
"multiTenancyConfig": {
"enabled": false
},
"properties": [
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "element_id",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "text",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "type",
"tokenization": "word"
},
{
"dataType": [
"object"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "metadata",
"nestedProperties": [
{
"dataType": [
"int"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "category_depth"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "parent_id",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "attached_to_filename",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "filetype",
"tokenization": "word"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "last_modified"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "file_directory",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "filename",
"tokenization": "word"
},
{
"dataType": [
"object"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "data_source",
"nestedProperties": [
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "url",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "version",
"tokenization": "word"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "date_created"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "date_modified"
},
{
"dataType": [
"date"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "date_processed"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "record_locator",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "permissions_data",
"tokenization": "word"
}
]
},
{
"dataType": [
"object"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "coordinates",
"nestedProperties": [
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "system",
"tokenization": "word"
},
{
"dataType": [
"number"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "layout_width"
},
{
"dataType": [
"number"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "layout_height"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "points",
"tokenization": "word"
}
]
},
{
"dataType": [
"text[]"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "languages",
"tokenization": "word"
},
{
"dataType": [
"int"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "page_number"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "page_name",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "url",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "links",
"tokenization": "word"
},
{
"dataType": [
"text[]"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "link_urls",
"tokenization": "word"
},
{
"dataType": [
"text[]"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "link_texts",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "sent_from",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "sent_to",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "subject",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "section",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "header_footer_type",
"tokenization": "word"
},
{
"dataType": [
"text[]"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "emphasized_text_contents",
"tokenization": "word"
},
{
"dataType": [
"text[]"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "emphasized_text_tags",
"tokenization": "word"
},
{
"dataType": [
"text"
],
"indexFilterable": true,
"indexSearchable": true,
"name": "text_as_html",
"tokenization": "word"
},
{
"dataType": [
"number"
],
"indexFilterable": true,
"indexSearchable": false,
"name": "detection_class_prob"
}
]
}
],
"replicationConfig": {
"factor": 1
},
"shardingConfig": {
"virtualPerPhysical": 128,
"desiredCount": 1,
"actualCount": 1,
"desiredVirtualCount": 128,
"actualVirtualCount": 128,
"key": "_id",
"strategy": "hash",
"function": "murmur3"
},
"vectorIndexConfig": {
"skip": false,
"cleanupIntervalSeconds": 300,
"maxConnections": 64,
"efConstruction": 128,
"ef": -1,
"dynamicEfMin": 100,
"dynamicEfMax": 500,
"dynamicEfFactor": 8,
"vectorCacheMaxObjects": 1000000000000,
"flatSearchCutoff": 40000,
"distance": "cosine",
"pq": {
"enabled": false,
"bitCompression": false,
"segments": 0,
"centroids": 256,
"trainingLimit": 100000,
"encoder": {
"type": "kmeans",
"distribution": "log-normal"
}
}
},
"vectorIndexType": "hnsw",
"vectorizer": "none"
}