Bulk insert in sql document stores (#2264)

* bulk_save_objects

* use bulk_insert_mappings in sql document store

* bug fix: write_documents overwrite

* update documentation write_documents

* Update Documentation & Code Style

* small changes for readability

* bug fix: missing quotation marks

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: Sara Zan <sara.zanzottera@deepset.ai>

Author: OmniScience (committed by GitHub)
Date: 2022-03-09 12:11:26 +01:00
Commit: 004e7f33af
Parent: a97a9d2b48
2 changed files with 19 additions and 11 deletions
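
For context, the core of this change is switching from per-object session.add() calls to SQLAlchemy's bulk_insert_mappings(), which takes plain dictionaries and skips the unit-of-work bookkeeping that tracked ORM instances require. The sketch below is illustrative only and is not Haystack code: the Item model, engine URL, and row counts are invented for the example, and the imports assume SQLAlchemy 1.4+.

    # Contrast of per-object inserts vs. bulk_insert_mappings (illustrative model, not Haystack's).
    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class Item(Base):
        __tablename__ = "item"
        id = Column(Integer, primary_key=True)
        name = Column(String(100))

    engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(engine)
    session = sessionmaker(bind=engine)()

    # Per-object inserts: every row becomes a tracked ORM instance (slow for large batches).
    for i in range(1000):
        session.add(Item(id=i, name=f"doc-{i}"))
    session.commit()

    # Bulk insert: plain dicts, no identity-map bookkeeping, batched executemany statements.
    mappings = [{"id": i, "name": f"doc-{i}"} for i in range(1000, 2000)]
    session.bulk_insert_mappings(Item, mappings)
    session.commit()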


@@ -1790,7 +1790,8 @@ documents for evaluation can be indexed in a separate index than the documents f
 - `duplicate_documents`: Handle duplicates document based on parameter options.
   Parameter options : ( 'skip','overwrite','fail')
   skip: Ignore the duplicates documents
-  overwrite: Update any existing documents with the same ID when adding documents.
+  overwrite: Update any existing documents with the same ID when adding documents
+  but is considerably slower (default).
   fail: an error is raised if the document ID of the document being added already
   exists.
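
As a usage illustration of the three options documented above (a sketch only: the SQLite URL and the example document are placeholders, and the imports assume Haystack 1.x):

    from haystack.document_stores import SQLDocumentStore
    from haystack.schema import Document

    document_store = SQLDocumentStore(url="sqlite:///example.db")  # placeholder URL
    docs = [Document(content="Berlin is the capital of Germany.")]

    # 'skip': duplicates are ignored; new rows go through the bulk insert path.
    document_store.write_documents(docs, duplicate_documents="skip")

    # 'overwrite' (default): existing rows with the same ID are merged one by one,
    # which is why it is flagged as considerably slower.
    document_store.write_documents(docs, duplicate_documents="overwrite")

    # 'fail': an error is raised if a document ID already exists in the index.
    # document_store.write_documents(docs, duplicate_documents="fail")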


@@ -369,7 +369,8 @@ class SQLDocumentStore(BaseDocumentStore):
         :param duplicate_documents: Handle duplicates document based on parameter options.
                                     Parameter options : ( 'skip','overwrite','fail')
                                     skip: Ignore the duplicates documents
-                                    overwrite: Update any existing documents with the same ID when adding documents.
+                                    overwrite: Update any existing documents with the same ID when adding documents
+                                    but is considerably slower (default).
                                     fail: an error is raised if the document ID of the document being added already
                                     exists.
@@ -392,24 +393,30 @@ class SQLDocumentStore(BaseDocumentStore):
             documents=document_objects, index=index, duplicate_documents=duplicate_documents
         )
         for i in range(0, len(document_objects), batch_size):
+            docs_orm = []
             for doc in document_objects[i : i + batch_size]:
                 meta_fields = doc.meta or {}
                 vector_id = meta_fields.pop("vector_id", None)
                 meta_orms = [MetaDocumentORM(name=key, value=value) for key, value in meta_fields.items()]
-                doc_orm = DocumentORM(
-                    id=doc.id,
-                    content=doc.to_dict()["content"],
-                    content_type=doc.content_type,
-                    vector_id=vector_id,
-                    meta=meta_orms,
-                    index=index,
-                )
+                doc_mapping = {
+                    "id": doc.id,
+                    "content": doc.to_dict()["content"],
+                    "content_type": doc.content_type,
+                    "vector_id": vector_id,
+                    "meta": meta_orms,
+                    "index": index,
+                }
                 if duplicate_documents == "overwrite":
+                    doc_orm = DocumentORM(**doc_mapping)
                     # First old meta data cleaning is required
                     self.session.query(MetaDocumentORM).filter_by(document_id=doc.id).delete()
                     self.session.merge(doc_orm)
                 else:
-                    self.session.add(doc_orm)
+                    docs_orm.append(doc_mapping)
+
+            if docs_orm:
+                self.session.bulk_insert_mappings(DocumentORM, docs_orm)
+
             try:
                 self.session.commit()
             except Exception as ex:
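
A note on the design choice visible in this hunk: only documents that do not take the "overwrite" branch are collected into docs_orm and written with a single bulk_insert_mappings() call per batch; overwritten documents are still merged row by row (after their old metadata rows are deleted), which is what the updated docstring means by "considerably slower". The schematic below restates that control flow in plain Python with no database; the function and callback names are invented for illustration and are not Haystack helpers.

    # Schematic of the new write path per batch (illustrative helper names, no real DB).
    def write_batch(docs, duplicate_documents, merge_one, bulk_insert):
        mappings = []
        for doc in docs:
            if duplicate_documents == "overwrite":
                merge_one(doc)            # per-row merge; old metadata cleaned first
            else:
                mappings.append(doc)      # deferred for one bulk insert
        if mappings:
            bulk_insert(mappings)         # single bulk_insert_mappings call per batch

    # Example: two new documents under the 'skip' policy -> one bulk insert of 2 rows.
    write_batch(
        docs=[{"id": "doc-1"}, {"id": "doc-2"}],
        duplicate_documents="skip",
        merge_one=lambda doc: None,
        bulk_insert=lambda rows: print(f"bulk inserting {len(rows)} rows"),
    )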