Removing (deprecation) warnings (#530)

1. A few warnings still need a fix in FARM. 2. The warning from the docx library can't be removed.
2025-12-29 07:59:27 +00:00 · 2020-11-02 15:18:43 +01:00 · 2020-11-02 15:18:43 +01:00 · 5d45992c84
commit 5d45992c84
parent f5419163e7
4 changed files with 9 additions and 8 deletions
--- a/haystack/document_store/elasticsearch.py
+++ b/haystack/document_store/elasticsearch.py
@ -304,7 +304,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):

    def update_document_meta(self, id: str, meta: Dict[str, str]):
        body = {"doc": meta}
-        self.client.update(index=self.index, doc_type="_doc", id=id, body=body, refresh=self.refresh_type)
+        self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type)

    def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
        index = index or self.index
--- a/haystack/preprocessor/utils.py
+++ b/haystack/preprocessor/utils.py
@ -196,12 +196,14 @@ def tika_convert_files_to_dicts(
                    last_para = ''
                    for para in paras:
                        para = para.strip()
-                        if not para: continue
+                        if not para:
+                            continue
                        # merge paragraphs to improve qa
                        # merge this paragraph if less than 10 characters or 2 words
                        # or this paragraph starts with a lower case and last paragraph does not end with a punctuation
-                        if merge_short and len(para) < 10 or len(re.findall('\s+', para)) < 2 \
-                            or merge_lowercase and para and para[0].islower() and last_para and last_para[-1] not in '.?!"\'\]\)':
+                        if merge_short and len(para) < 10 or len(re.findall(r'\s+', para)) < 2 \
+                                or merge_lowercase and para and para[0].islower() and last_para \
+                                and last_para[-1] not in r'.?!"\'\]\)':
                            last_para += ' ' + para
                        else:
                            if last_para:
--- a/haystack/reader/farm.py
+++ b/haystack/reader/farm.py
@ -338,6 +338,7 @@ class FARMReader(BaseReader):
            inputs.append(cur)

        # get answers from QA model
+        # TODO: Needs a fix in FARM's `to_dict` function of the `QAInput` class
        predictions = self.inferencer.inference_from_objects(
            objects=inputs, return_json=False, multiprocessing_chunksize=1
        )
--- a/haystack/retriever/dense.py
+++ b/haystack/retriever/dense.py
@ -5,13 +5,9 @@ import numpy as np
 from pathlib import Path
 from tqdm import tqdm

-from farm.infer import Inferencer
-
 from haystack.document_store.base import BaseDocumentStore
 from haystack import Document
-from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
 from haystack.retriever.base import BaseRetriever
-from haystack.retriever.sparse import logger

 from farm.infer import Inferencer
 from farm.modeling.tokenization import Tokenizer
@ -374,6 +370,8 @@ class EmbeddingRetriever(BaseRetriever):
        assert type(texts) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])"

        if self.model_format == "farm" or self.model_format == "transformers":
+            # TODO: FARM's `sample_to_features_text` needs to be fixed to avoid the following warning -
+            # tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.
            emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts])  # type: ignore
            emb = [(r["vec"]) for r in emb]
        elif self.model_format == "sentence_transformers":