Add support for Dense Retrievers in REST API Indexing Pipeline (#1430)

This commit is contained in:
oryx1729 2021-09-10 11:53:32 +02:00 committed by GitHub
parent 9dd7c74f4f
commit 1f859694f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 76 additions and 10 deletions

View File

@@ -17,10 +17,24 @@ logger = logging.getLogger(__name__)
router = APIRouter()

# Build the indexing pipeline used by the file-upload endpoints. INDEXING_PIPELINE is left as None
# when the pipeline cannot (or must not) be created.
try:
    # Read the pipeline's config and component definitions first, so the document store type can be
    # checked before any component is instantiated.
    _, pipeline_config, definitions = Pipeline._read_yaml(
        path=Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME, overwrite_with_env_variables=True
    )
    # Since each instance of FAISSDocumentStore creates an in-memory FAISS index, the Indexing & Query Pipelines would
    # end up with different indices. The check below prevents creation of Indexing Pipelines with FAISSDocumentStore.
    is_faiss_present = any(
        definitions[node["name"]]["type"] == "FAISSDocumentStore" for node in pipeline_config["nodes"]
    )
    if is_faiss_present:
        logger.warning("Indexing Pipeline with FAISSDocumentStore is not supported with the REST APIs.")
        INDEXING_PIPELINE = None
    else:
        # Only load (and thereby instantiate) the pipeline once the check above has passed.
        INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
except KeyError:
    # NOTE(review): presumably raised when the named pipeline is absent from the YAML; a KeyError from a
    # node without a matching component definition would end up here as well - confirm.
    INDEXING_PIPELINE = None
    logger.warning("Indexing Pipeline not found in the YAML configuration. File Upload API will not be available.")

os.makedirs(FILE_UPLOAD_PATH, exist_ok=True)  # create directory for uploading files

View File

@@ -1,14 +1,14 @@
version: '0.7'
version: '0.9'
components: # define all the building-blocks for Pipeline
- name: ElasticsearchDocumentStore
- name: DocumentStore
type: ElasticsearchDocumentStore
params:
host: localhost
- name: ESRetriever
- name: Retriever
type: ElasticsearchRetriever
params:
document_store: ElasticsearchDocumentStore # params can reference other components defined in the YAML
document_store: DocumentStore # params can reference other components defined in the YAML
top_k: 5
- name: Reader # custom-name for the component; helpful for visualization & debugging
type: FARMReader # Haystack Class name for the component
@@ -30,11 +30,10 @@ pipelines:
- name: query # a sample extractive-qa Pipeline
type: Query
nodes:
- name: ESRetriever
- name: Retriever
inputs: [Query]
- name: Reader
inputs: [ESRetriever]
inputs: [Retriever]
- name: indexing
type: Indexing
nodes:
@@ -46,5 +45,7 @@ pipelines:
inputs: [FileTypeClassifier.output_2]
- name: Preprocessor
inputs: [PDFFileConverter, TextFileConverter]
- name: ElasticsearchDocumentStore
- name: Retriever
inputs: [Preprocessor]
- name: DocumentStore
inputs: [Retriever]

View File

@@ -0,0 +1,51 @@
# Sample query and indexing pipelines that use a dense retriever (DensePassageRetriever).
# NOTE(review): in the indexing pipeline the Retriever node sits between the Preprocessor and the
# DocumentStore, presumably so that embeddings are computed before documents are written - confirm.
version: '0.9'

components:    # define all the building-blocks for Pipeline
  - name: DocumentStore
    type: ElasticsearchDocumentStore  # consider using MilvusDocumentStore or WeaviateDocumentStore for scaling to large number of documents
    params:
      host: localhost
  - name: Retriever
    type: DensePassageRetriever
    params:
      document_store: DocumentStore    # params can reference other components defined in the YAML
      top_k: 5
  - name: Reader       # custom-name for the component; helpful for visualization & debugging
    type: FARMReader    # Haystack Class name for the component
    params:
      model_name_or_path: deepset/roberta-base-squad2
  - name: TextFileConverter
    type: TextConverter
  - name: PDFFileConverter
    type: PDFToTextConverter
  - name: Preprocessor
    type: PreProcessor
    params:
      split_by: word
      split_length: 1000
  - name: FileTypeClassifier
    type: FileTypeClassifier

pipelines:
  - name: query    # a sample extractive-qa Pipeline
    type: Query
    nodes:
      - name: Retriever
        inputs: [Query]
      - name: Reader
        inputs: [Retriever]
  - name: indexing
    type: Indexing
    nodes:
      - name: FileTypeClassifier
        inputs: [File]
      - name: TextFileConverter
        inputs: [FileTypeClassifier.output_1]
      - name: PDFFileConverter
        inputs: [FileTypeClassifier.output_2]
      - name: Preprocessor
        inputs: [PDFFileConverter, TextFileConverter]
      - name: Retriever
        inputs: [Preprocessor]
      - name: DocumentStore
        inputs: [Retriever]