feat: Add CsvTextConverter (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550: allows users to build a full FAQ pipeline from a YAML pipeline description, with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
This commit is contained in:
Benjamin BERNARD 2023-01-23 15:56:36 +01:00 committed by GitHub
parent 94f660c56f
commit eed009eddb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 230 additions and 0 deletions

View File

@ -19,6 +19,7 @@ from haystack.nodes.file_converter import (
TextConverter,
AzureConverter,
ParsrConverter,
CsvTextConverter,
)
from haystack.nodes.label_generator import PseudoLabelGenerator
from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger

View File

@ -2,6 +2,7 @@ from haystack.nodes.file_converter.base import BaseConverter
from haystack.utils.import_utils import safe_import
from haystack.nodes.file_converter.csv import CsvTextConverter
from haystack.nodes.file_converter.docx import DocxToTextConverter
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
from haystack.nodes.file_converter.txt import TextConverter

View File

@ -0,0 +1,61 @@
from typing import Union, List, Optional, Any, Dict
import logging
from pathlib import Path
import pandas as pd
from haystack import Document
from haystack.nodes.file_converter import BaseConverter
logger = logging.getLogger(__name__)
class CsvTextConverter(BaseConverter):
    """
    Converts Question & Answer CSV files to text Documents.

    The CSV must contain exactly two columns named ``question`` and ``answer``
    (in this order). Each row becomes one Document whose content is the
    question and whose meta carries the answer.
    """

    outgoing_edges = 1

    def convert(
        self,
        file_path: Union[Path, List[Path], str, List[str], List[Union[Path, str]]],
        meta: Optional[Dict[str, Any]],
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Load one or more CSV files and convert them to documents.

        :param file_path: Path (or list of paths) to CSV files containing two columns.
            The first will be interpreted as a question, the second as content.
        :param meta: Optional metadata dictionary attached to every produced Document.
        :param remove_numeric_tables: Unused; kept for BaseConverter interface compatibility.
        :param valid_languages: Unused; kept for BaseConverter interface compatibility.
        :param encoding: Text encoding used to read the CSV files. Defaults to "UTF-8".
        :param id_hash_keys: Document attributes used to compute each document's id hash.
        :raises ValueError: If a CSV does not have exactly the columns 'question' and 'answer'.
        :returns: List of document, 1 document per line in the CSV.
        """
        if not isinstance(file_path, list):
            file_path = [file_path]

        docs: List[Document] = []
        for path in file_path:
            df = pd.read_csv(path, encoding=encoding)
            # Enforce the exact two-column "question,answer" schema, order included.
            if list(df.columns) != ["question", "answer"]:
                raise ValueError("The CSV must contain two columns named 'question' and 'answer'")

            df.fillna(value="", inplace=True)
            df["question"] = df["question"].apply(lambda x: x.strip())
            # Document.from_dict expects the document text under the "content" key.
            df = df.rename(columns={"question": "content"})

            for dictionary in df.to_dict(orient="records"):
                if meta:
                    dictionary["meta"] = meta
                if id_hash_keys:
                    dictionary["id_hash_keys"] = id_hash_keys
                docs.append(Document.from_dict(dictionary))

        return docs

View File

@ -0,0 +1,40 @@
# To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
version: ignore
components: # define all the building-blocks for Pipeline
  - name: DocumentStore
    type: ElasticsearchDocumentStore
    params:
      host: localhost
      embedding_field: question_emb # field where the question embeddings are stored
      embedding_dim: 384 # must match the output dimension of the embedding model below
      excluded_meta_data:
        - question_emb # don't return the raw embedding vector with query results
      similarity: cosine
  - name: Retriever
    type: EmbeddingRetriever
    params:
      document_store: DocumentStore # params can reference other components defined in the YAML
      embedding_model: sentence-transformers/all-MiniLM-L6-v2
      scale_score: False
  - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging
    type: Docs2Answers # Haystack Class name for the component
  - name: CsvTextConverter # turns question/answer CSV rows into Documents
    type: CsvTextConverter
pipelines:
  - name: query # a sample extractive-qa Pipeline
    nodes:
      - name: Retriever
        inputs: [Query]
      - name: Doc2Answers
        inputs: [Retriever]
  - name: indexing # FAQ indexing: CSV files -> Documents -> embeddings -> store
    nodes:
      - name: CsvTextConverter
        inputs: [File]
      - name: Retriever
        inputs: [ CsvTextConverter ]
      - name: DocumentStore
        inputs: [ Retriever ]

View File

@ -1,10 +1,14 @@
from typing import List
import os
import sys
from pathlib import Path
import subprocess
import csv
import pytest
from haystack import Document
from haystack.nodes import (
MarkdownConverter,
DocxToTextConverter,
@ -14,6 +18,7 @@ from haystack.nodes import (
AzureConverter,
ParsrConverter,
TextConverter,
CsvTextConverter,
)
from ..conftest import SAMPLES_PATH
@ -265,3 +270,125 @@ def test_id_hash_keys_from_pipeline_params():
assert len(documents) == 2
assert len(unique_ids) == 2
def write_as_csv(data: List[List[str]], file_path: Path):
    """
    Write the given rows to a CSV file at the given path.

    :param data: Rows to write; each row is a list of field values.
    :param file_path: Destination file path.
    """
    # newline="" is required by the csv module: without it the writer's own
    # "\r\n" terminators get translated again on Windows, producing blank lines.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(data)
@pytest.mark.integration
def test_csv_to_document_with_qa_headers(tmp_path):
    """A well-formed question/answer CSV is converted into exactly one Document."""
    question = "What is Haystack ?"
    answer = "Haystack is an NLP Framework to use transformers in your Applications."
    csv_path = tmp_path / "csv_qa_with_headers.csv"
    write_as_csv([["question", "answer"], [question, answer]], csv_path)

    output, edge = CsvTextConverter().run(file_paths=csv_path)

    assert edge == "output_1"
    assert "documents" in output
    documents = output["documents"]
    assert len(documents) == 1
    assert isinstance(documents[0], Document)
    assert documents[0].content == question
    assert documents[0].meta["answer"] == answer
@pytest.mark.integration
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    """Both header names wrong: the converter must reject the file."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["wrong", "headers"]
    row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
    write_as_csv([header, row], csv_path)

    converter = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        converter.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    """First header name wrong ('wrong' instead of 'question'): must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    content = [
        ["wrong", "answers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(content, csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    """Second header name wrong ('wrong' instead of 'answer'): must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["question", "wrong"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_one_column(tmp_path):
    """A single-column CSV (answer column missing) must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv([["question"], ["What is Haystack ?"]], csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_three_columns(tmp_path):
    """A CSV with an extra third column must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["question", "answer", "notes"]
    row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"]
    write_as_csv([header, row], csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_many_files(tmp_path):
    """Converting a list of CSV files yields one Document per file, in input order."""
    # A single converter instance handles all files; the original test
    # needlessly re-created the node on every loop iteration.
    node = CsvTextConverter()

    csv_paths = []
    for i in range(5):
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        rows = [
            ["question", "answer"],
            [
                f"{i}. What is Haystack ?",
                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
            ],
        ]
        write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_paths)

    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i in range(5):
        doc = output["documents"][i]
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."