Bulk insert in sql document stores (#2264)

* bulk_save_objects

* use bulk_insert_mappings in sql document store

* bug fix: write_documents overwrite

* update documentation write_documents

* Update Documentation & Code Style

* small changes for readability

* bug fix: missing quotation marks

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>

Author: OmniScience (committed by GitHub)
Date: 2022-03-09 12:11:26 +01:00
Commit: 004e7f33af
Parent: a97a9d2b48
2 changed files with 19 additions and 11 deletions
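
For context, the core of this change is switching from per-object session.add() calls to SQLAlchemy's bulk_insert_mappings(), which takes plain dictionaries and skips the unit-of-work bookkeeping that tracked ORM instances require. The sketch below is illustrative only and is not Haystack code: the Item model, engine URL, and row counts are invented for the example, and the imports assume SQLAlchemy 1.4+.

    # Contrast of per-object inserts vs. bulk_insert_mappings (illustrative model, not Haystack's).
    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class Item(Base):
        __tablename__ = "item"
        id = Column(Integer, primary_key=True)
        name = Column(String(100))

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    # Per-object inserts: every row becomes a tracked ORM instance (slow for large batches).
    for i in range(1000):
        session.add(Item(id=i, name=f"doc-{i}"))
    session.commit()

    # Bulk insert: plain dicts, no identity-map bookkeeping, batched executemany statements.
    mappings = [{"id": i, "name": f"doc-{i}"} for i in range(1000, 2000)]
    session.bulk_insert_mappings(Item, mappings)
    session.commit()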


@@ -1790,7 +1790,8 @@ documents for evaluation can be indexed in a separate index than the documents f
 - `duplicate_documents`: Handle duplicates document based on parameter options.
   Parameter options : ( 'skip','overwrite','fail')
   skip: Ignore the duplicates documents
-  overwrite: Update any existing documents with the same ID when adding documents.
+  overwrite: Update any existing documents with the same ID when adding documents
+  but is considerably slower (default).
   fail: an error is raised if the document ID of the document being added already
   exists.
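
As a usage illustration of the three options documented above (a sketch only: the SQLite URL and the example document are placeholders, and the imports assume Haystack 1.x):

    from haystack.document_stores import SQLDocumentStore
    from haystack.schema import Document

    document_store = SQLDocumentStore(url="sqlite:///example.db")  # placeholder URL
    docs = [Document(content="Berlin is the capital of Germany.")]

    # 'skip': duplicates are ignored; new rows go through the bulk insert path.
    document_store.write_documents(docs, duplicate_documents="skip")

    # 'overwrite' (default): existing rows with the same ID are merged one by one,
    # which is why it is flagged as considerably slower.
    document_store.write_documents(docs, duplicate_documents="overwrite")

    # 'fail': an error is raised if a document ID already exists in the index.
    # document_store.write_documents(docs, duplicate_documents="fail")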


@@ -369,7 +369,8 @@ class SQLDocumentStore(BaseDocumentStore):
         :param duplicate_documents: Handle duplicates document based on parameter options.
                                     Parameter options : ( 'skip','overwrite','fail')
                                     skip: Ignore the duplicates documents
-                                    overwrite: Update any existing documents with the same ID when adding documents.
+                                    overwrite: Update any existing documents with the same ID when adding documents
+                                    but is considerably slower (default).
                                     fail: an error is raised if the document ID of the document being added already
                                     exists.
@@ -392,24 +393,30 @@ class SQLDocumentStore(BaseDocumentStore):
             documents=document_objects, index=index, duplicate_documents=duplicate_documents
         )
         for i in range(0, len(document_objects), batch_size):
+            docs_orm = []
             for doc in document_objects[i : i + batch_size]:
                 meta_fields = doc.meta or {}
                 vector_id = meta_fields.pop("vector_id", None)
                 meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
-                doc_orm = DocumentORM(
-                    id=doc.id,
-                    content=doc.to_dict()["content"],
-                    content_type=doc.content_type,
-                    vector_id=vector_id,
-                    meta=meta_orms,
-                    index=index,
-                )
+                doc_mapping = {
+                    "id": doc.id,
+                    "content": doc.to_dict()["content"],
+                    "content_type": doc.content_type,
+                    "vector_id": vector_id,
+                    "meta": meta_orms,
+                    "index": index,
+                }
                 if duplicate_documents == "overwrite":
+                    doc_orm = DocumentORM(**doc_mapping)
                     # First old meta data cleaning is required
                     self.session.query(MetaDocumentORM).filter_by(document_id=doc.id).delete()
                     self.session.merge(doc_orm)
                 else:
-                    self.session.add(doc_orm)
+                    docs_orm.append(doc_mapping)
+
+            if docs_orm:
+                self.session.bulk_insert_mappings(DocumentORM, docs_orm)
+
             try:
                 self.session.commit()
             except Exception as ex:
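
A note on the design choice visible in this hunk: only documents that do not take the "overwrite" branch are collected into docs_orm and written with a single bulk_insert_mappings() call per batch; overwritten documents are still merged row by row (after their old metadata rows are deleted), which is what the updated docstring means by "considerably slower". The schematic below restates that control flow in plain Python with no database; the function and callback names are invented for illustration and are not Haystack helpers.

    # Schematic of the new write path per batch (illustrative helper names, no real DB).
    def write_batch(docs, duplicate_documents, merge_one, bulk_insert):
        mappings = []
        for doc in docs:
            if duplicate_documents == "overwrite":
                merge_one(doc)            # per-row merge; old metadata cleaned first
            else:
                mappings.append(doc)      # deferred for one bulk insert
        if mappings:
            bulk_insert(mappings)         # single bulk_insert_mappings call per batch

    # Example: two new documents under the 'skip' policy -> one bulk insert of 2 rows.
    write_batch(
        docs=[{"id": "doc-1"}, {"id": "doc-2"}],
        duplicate_documents="skip",
        merge_one=lambda doc: None,
        bulk_insert=lambda rows: print(f"bulk inserting {len(rows)} rows"),
    )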