diff --git a/haystack/nodes/__init__.py b/haystack/nodes/__init__.py
index ec3e6126d..facbb3ce7 100644
--- a/haystack/nodes/__init__.py
+++ b/haystack/nodes/__init__.py
@@ -19,6 +19,7 @@ from haystack.nodes.file_converter import (
     TextConverter,
     AzureConverter,
     ParsrConverter,
+    CsvTextConverter,
 )
 from haystack.nodes.label_generator import PseudoLabelGenerator
 from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger
diff --git a/haystack/nodes/file_converter/__init__.py b/haystack/nodes/file_converter/__init__.py
index db0b714fc..74f9fbb4f 100644
--- a/haystack/nodes/file_converter/__init__.py
+++ b/haystack/nodes/file_converter/__init__.py
@@ -2,6 +2,7 @@
 from haystack.nodes.file_converter.base import BaseConverter
 from haystack.utils.import_utils import safe_import
 
+from haystack.nodes.file_converter.csv import CsvTextConverter
 from haystack.nodes.file_converter.docx import DocxToTextConverter
 from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
 from haystack.nodes.file_converter.txt import TextConverter
diff --git a/haystack/nodes/file_converter/csv.py b/haystack/nodes/file_converter/csv.py
new file mode 100644
index 000000000..f677f240b
--- /dev/null
+++ b/haystack/nodes/file_converter/csv.py
@@ -0,0 +1,61 @@
+from typing import Union, List, Optional, Any, Dict
+
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+from haystack import Document
+from haystack.nodes.file_converter import BaseConverter
+
+
+logger = logging.getLogger(__name__)
+
+
+class CsvTextConverter(BaseConverter):
+    """
+    Converts Question & Answer CSV files to text Documents.
+    """
+
+    outgoing_edges = 1
+
+    def convert(
+        self,
+        file_path: Union[Path, List[Path], str, List[str], List[Union[Path, str]]],
+        meta: Optional[Dict[str, Any]],
+        remove_numeric_tables: Optional[bool] = None,
+        valid_languages: Optional[List[str]] = None,
+        encoding: Optional[str] = "UTF-8",
+        id_hash_keys: Optional[List[str]] = None,
+    ) -> List[Document]:
+        """
+        Load a CSV file and convert it to Documents.
+
+        :param file_path: Path to a CSV file containing two columns named 'question' and 'answer'.
+            The first column is interpreted as the question, the second as the content.
+        :returns: List of Documents, one Document per row in the CSV.
+ """ + if not isinstance(file_path, list): + file_path = [file_path] + + docs: List[Document] = [] + for path in file_path: + df = pd.read_csv(path, encoding=encoding) + + if len(df.columns) != 2 or df.columns[0] != "question" or df.columns[1] != "answer": + raise ValueError("The CSV must contain two columns named 'question' and 'answer'") + + df.fillna(value="", inplace=True) + df["question"] = df["question"].apply(lambda x: x.strip()) + + df = df.rename(columns={"question": "content"}) + docs_dicts = df.to_dict(orient="records") + + for dictionary in docs_dicts: + if meta: + dictionary["meta"] = meta + if id_hash_keys: + dictionary["id_hash_keys"] = id_hash_keys + docs.append(Document.from_dict(dictionary)) + + return docs diff --git a/rest_api/rest_api/pipeline/pipelines_faq.haystack-pipeline.yml b/rest_api/rest_api/pipeline/pipelines_faq.haystack-pipeline.yml new file mode 100644 index 000000000..3473660ff --- /dev/null +++ b/rest_api/rest_api/pipeline/pipelines_faq.haystack-pipeline.yml @@ -0,0 +1,40 @@ +# To allow your IDE to autocomplete and validate your YAML pipelines, name them as .haystack-pipeline.yml + +version: ignore + +components: # define all the building-blocks for Pipeline + - name: DocumentStore + type: ElasticsearchDocumentStore + params: + host: localhost + embedding_field: question_emb + embedding_dim: 384 + excluded_meta_data: + - question_emb + similarity: cosine + - name: Retriever + type: EmbeddingRetriever + params: + document_store: DocumentStore # params can reference other components defined in the YAML + embedding_model: sentence-transformers/all-MiniLM-L6-v2 + scale_score: False + - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging + type: Docs2Answers # Haystack Class name for the component + - name: CsvTextConverter + type: CsvTextConverter + +pipelines: + - name: query # a sample extractive-qa Pipeline + nodes: + - name: Retriever + inputs: [Query] + - name: Doc2Answers + inputs: [Retriever] + - name: indexing + nodes: + - name: CsvTextConverter + inputs: [File] + - name: Retriever + inputs: [ CsvTextConverter ] + - name: DocumentStore + inputs: [ Retriever ] diff --git a/test/nodes/test_file_converter.py b/test/nodes/test_file_converter.py index d515287b1..cdcffa10b 100644 --- a/test/nodes/test_file_converter.py +++ b/test/nodes/test_file_converter.py @@ -1,10 +1,14 @@ +from typing import List + import os import sys from pathlib import Path import subprocess +import csv import pytest +from haystack import Document from haystack.nodes import ( MarkdownConverter, DocxToTextConverter, @@ -14,6 +18,7 @@ from haystack.nodes import ( AzureConverter, ParsrConverter, TextConverter, + CsvTextConverter, ) from ..conftest import SAMPLES_PATH @@ -265,3 +270,125 @@ def test_id_hash_keys_from_pipeline_params(): assert len(documents) == 2 assert len(unique_ids) == 2 + + +def write_as_csv(data: List[List[str]], file_path: Path): + with open(file_path, "w") as f: + writer = csv.writer(f) + writer.writerows(data) + + +@pytest.mark.integration +def test_csv_to_document_with_qa_headers(tmp_path): + node = CsvTextConverter() + csv_path = tmp_path / "csv_qa_with_headers.csv" + rows = [ + ["question", "answer"], + ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."], + ] + write_as_csv(rows, csv_path) + + output, edge = node.run(file_paths=csv_path) + assert edge == "output_1" + assert "documents" in output + assert len(output["documents"]) == 1 + + doc = output["documents"][0] + assert 
+    assert doc.content == "What is Haystack ?"
+    assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."
+
+
+@pytest.mark.integration
+def test_csv_to_document_with_wrong_qa_headers(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["wrong", "headers"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
+    ]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
+        node.run(file_paths=csv_path)
+
+
+@pytest.mark.integration
+def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["wrong", "answers"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
+    ]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
+        node.run(file_paths=csv_path)
+
+
+@pytest.mark.integration
+def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["question", "wrong"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
+    ]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
+        node.run(file_paths=csv_path)
+
+
+@pytest.mark.integration
+def test_csv_to_document_with_one_column(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [["question"], ["What is Haystack ?"]]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
+        node.run(file_paths=csv_path)
+
+
+@pytest.mark.integration
+def test_csv_to_document_with_three_columns(tmp_path):
+    node = CsvTextConverter()
+    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
+    rows = [
+        ["question", "answer", "notes"],
+        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"],
+    ]
+    write_as_csv(rows, csv_path)
+
+    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
+        node.run(file_paths=csv_path)
+
+
+@pytest.mark.integration
+def test_csv_to_document_many_files(tmp_path):
+    csv_paths = []
+    for i in range(5):
+        node = CsvTextConverter()
+        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
+        csv_paths.append(csv_path)
+        rows = [
+            ["question", "answer"],
+            [
+                f"{i}. What is Haystack ?",
+                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
+            ],
+        ]
+        write_as_csv(rows, csv_path)
+
+    output, edge = node.run(file_paths=csv_paths)
+    assert edge == "output_1"
+    assert "documents" in output
+    assert len(output["documents"]) == 5
+
+    for i in range(5):
+        doc = output["documents"][i]
+        assert isinstance(doc, Document)
+        assert doc.content == f"{i}. What is Haystack ?"
+        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."
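
A minimal usage sketch, not part of the diff above: it shows how the new `CsvTextConverter` could be exercised directly in Python for an FAQ-style setup. The file name `faq.csv` is a placeholder, and it swaps the `ElasticsearchDocumentStore` configured in the YAML for an `InMemoryDocumentStore` so it runs without external services.

```python
from haystack.document_stores import InMemoryDocumentStore
from haystack.nodes import CsvTextConverter, EmbeddingRetriever

# Convert the FAQ CSV into Documents: `content` holds the question,
# `meta["answer"]` holds the stored answer (as asserted in the tests above).
converter = CsvTextConverter()
docs = converter.convert(file_path="faq.csv", meta=None)  # "faq.csv" is a placeholder path

# Index the questions and embed them so incoming queries can be matched against them.
document_store = InMemoryDocumentStore(embedding_dim=384, similarity="cosine")
document_store.write_documents(docs)

retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    scale_score=False,
)
document_store.update_embeddings(retriever)

# At query time, the stored answer travels in the retrieved Document's metadata.
results = retriever.retrieve(query="What is Haystack ?", top_k=1)
print(results[0].meta["answer"])
```

Alternatively, the `indexing` pipeline from `pipelines_faq.haystack-pipeline.yml` can be loaded with `Pipeline.load_from_yaml(..., pipeline_name="indexing")` and fed the CSV via `run(file_paths=[...])`, provided an Elasticsearch instance is reachable at `localhost`.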