Removing (deprecation) warnings (#530)

1. A few warnings still need a fix in FARM.
2. The warning from the docx library can't be removed.
This commit is contained in:
Lalit Pagaria 2020-11-02 15:18:43 +01:00 committed by GitHub
parent f5419163e7
commit 5d45992c84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 9 additions and 8 deletions

View File

@ -304,7 +304,7 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
def update_document_meta(self, id: str, meta: Dict[str, str]):
body = {"doc": meta}
self.client.update(index=self.index, doc_type="_doc", id=id, body=body, refresh=self.refresh_type)
self.client.update(index=self.index, id=id, body=body, refresh=self.refresh_type)
def get_document_count(self, filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int:
index = index or self.index

View File

@ -196,12 +196,14 @@ def tika_convert_files_to_dicts(
last_para = ''
for para in paras:
para = para.strip()
if not para: continue
if not para:
continue
# merge paragraphs to improve qa
# merge this paragraph if less than 10 characters or 2 words
# or this paragraph starts with a lower case and last paragraph does not end with a punctuation
if merge_short and len(para) < 10 or len(re.findall('\s+', para)) < 2 \
or merge_lowercase and para and para[0].islower() and last_para and last_para[-1] not in '.?!"\'\]\)':
if merge_short and len(para) < 10 or len(re.findall(r'\s+', para)) < 2 \
or merge_lowercase and para and para[0].islower() and last_para \
and last_para[-1] not in r'.?!"\'\]\)':
last_para += ' ' + para
else:
if last_para:

View File

@ -338,6 +338,7 @@ class FARMReader(BaseReader):
inputs.append(cur)
# get answers from QA model
# TODO: Needs a fix in FARM's `to_dict` function of the `QAInput` class
predictions = self.inferencer.inference_from_objects(
objects=inputs, return_json=False, multiprocessing_chunksize=1
)

View File

@ -5,13 +5,9 @@ import numpy as np
from pathlib import Path
from tqdm import tqdm
from farm.infer import Inferencer
from haystack.document_store.base import BaseDocumentStore
from haystack import Document
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.base import BaseRetriever
from haystack.retriever.sparse import logger
from farm.infer import Inferencer
from farm.modeling.tokenization import Tokenizer
@ -374,6 +370,8 @@ class EmbeddingRetriever(BaseRetriever):
assert type(texts) == list, "Expecting a list of texts, i.e. create_embeddings(texts=['text1',...])"
if self.model_format == "farm" or self.model_format == "transformers":
# TODO: FARM's `sample_to_features_text` needs a fix for the following warning:
# tokenization_utils.py:460: FutureWarning: `is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.
emb = self.embedding_model.inference_from_dicts(dicts=[{"text": t} for t in texts]) # type: ignore
emb = [(r["vec"]) for r in emb]
elif self.model_format == "sentence_transformers":