feat: Add CsvTextConverter (#3587)

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fixes #3550: allows users to build a full FAQ pipeline from a YAML pipeline description, with CSV import and indexing.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy and pylint.

* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline

Fix linter issues mypy.

* implement proposal's feedback

* tidy up for merge

* use BaseConverter

* use BaseConverter

* pylint

* black

* Revert "black"

This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.

* black

* add check for column names

* add check for column names

* add tests

* fix tests

* address lists of paths

* typo

* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
This commit is contained in:
Benjamin BERNARD 2023-01-23 15:56:36 +01:00 committed by GitHub
parent 94f660c56f
commit eed009eddb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 230 additions and 0 deletions

View File

@ -19,6 +19,7 @@ from haystack.nodes.file_converter import (
TextConverter,
AzureConverter,
ParsrConverter,
CsvTextConverter,
)
from haystack.nodes.label_generator import PseudoLabelGenerator
from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger

View File

@ -2,6 +2,7 @@ from haystack.nodes.file_converter.base import BaseConverter
from haystack.utils.import_utils import safe_import
from haystack.nodes.file_converter.csv import CsvTextConverter
from haystack.nodes.file_converter.docx import DocxToTextConverter
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
from haystack.nodes.file_converter.txt import TextConverter

View File

@ -0,0 +1,61 @@
from typing import Union, List, Optional, Any, Dict
import logging
from pathlib import Path
import pandas as pd
from haystack import Document
from haystack.nodes.file_converter import BaseConverter
logger = logging.getLogger(__name__)
class CsvTextConverter(BaseConverter):
    """
    Converts Question & Answer CSV files to text Documents.

    The CSV must contain exactly two columns named ``question`` and ``answer``
    (in this order). Each row becomes one Document whose content is the
    question and whose meta carries the answer.
    """

    outgoing_edges = 1

    def convert(
        self,
        file_path: Union[Path, List[Path], str, List[str], List[Union[Path, str]]],
        meta: Optional[Dict[str, Any]],
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Load one or more CSV files and convert them to documents.

        :param file_path: Path (or list of paths) to CSV files containing two columns.
            The first will be interpreted as a question, the second as content.
        :param meta: Optional metadata dictionary attached to every produced Document.
        :param remove_numeric_tables: Unused; kept for BaseConverter interface compatibility.
        :param valid_languages: Unused; kept for BaseConverter interface compatibility.
        :param encoding: Text encoding used to read the CSV files. Defaults to "UTF-8".
        :param id_hash_keys: Document attributes used to compute each document's id hash.
        :raises ValueError: If a CSV does not have exactly the columns 'question' and 'answer'.
        :returns: List of document, 1 document per line in the CSV.
        """
        if not isinstance(file_path, list):
            file_path = [file_path]

        docs: List[Document] = []
        for path in file_path:
            df = pd.read_csv(path, encoding=encoding)
            # Enforce the exact two-column "question,answer" schema, order included.
            if list(df.columns) != ["question", "answer"]:
                raise ValueError("The CSV must contain two columns named 'question' and 'answer'")

            df.fillna(value="", inplace=True)
            df["question"] = df["question"].apply(lambda x: x.strip())
            # Document.from_dict expects the document text under the "content" key.
            df = df.rename(columns={"question": "content"})

            for dictionary in df.to_dict(orient="records"):
                if meta:
                    dictionary["meta"] = meta
                if id_hash_keys:
                    dictionary["id_hash_keys"] = id_hash_keys
                docs.append(Document.from_dict(dictionary))

        return docs

View File

@ -0,0 +1,40 @@
# To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml
version: ignore
components: # define all the building-blocks for Pipeline
  - name: DocumentStore
    type: ElasticsearchDocumentStore
    params:
      host: localhost
      embedding_field: question_emb # field where the question embeddings are stored
      embedding_dim: 384 # must match the output dimension of the embedding model below
      excluded_meta_data:
        - question_emb # don't return the raw embedding vector with query results
      similarity: cosine
  - name: Retriever
    type: EmbeddingRetriever
    params:
      document_store: DocumentStore # params can reference other components defined in the YAML
      embedding_model: sentence-transformers/all-MiniLM-L6-v2
      scale_score: False
  - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging
    type: Docs2Answers # Haystack Class name for the component
  - name: CsvTextConverter # turns question/answer CSV rows into Documents
    type: CsvTextConverter
pipelines:
  - name: query # a sample extractive-qa Pipeline
    nodes:
      - name: Retriever
        inputs: [Query]
      - name: Doc2Answers
        inputs: [Retriever]
  - name: indexing # FAQ indexing: CSV files -> Documents -> embeddings -> store
    nodes:
      - name: CsvTextConverter
        inputs: [File]
      - name: Retriever
        inputs: [ CsvTextConverter ]
      - name: DocumentStore
        inputs: [ Retriever ]

View File

@ -1,10 +1,14 @@
from typing import List
import os
import sys
from pathlib import Path
import subprocess
import csv
import pytest
from haystack import Document
from haystack.nodes import (
MarkdownConverter,
DocxToTextConverter,
@ -14,6 +18,7 @@ from haystack.nodes import (
AzureConverter,
ParsrConverter,
TextConverter,
CsvTextConverter,
)
from ..conftest import SAMPLES_PATH
@ -265,3 +270,125 @@ def test_id_hash_keys_from_pipeline_params():
assert len(documents) == 2
assert len(unique_ids) == 2
def write_as_csv(data: List[List[str]], file_path: Path):
    """
    Write the given rows to a CSV file at the given path.

    :param data: Rows to write; each row is a list of field values.
    :param file_path: Destination file path.
    """
    # newline="" is required by the csv module: without it the writer's own
    # "\r\n" terminators get translated again on Windows, producing blank lines.
    with open(file_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(data)
@pytest.mark.integration
def test_csv_to_document_with_qa_headers(tmp_path):
    """A well-formed question/answer CSV is converted into exactly one Document."""
    question = "What is Haystack ?"
    answer = "Haystack is an NLP Framework to use transformers in your Applications."
    csv_path = tmp_path / "csv_qa_with_headers.csv"
    write_as_csv([["question", "answer"], [question, answer]], csv_path)

    output, edge = CsvTextConverter().run(file_paths=csv_path)

    assert edge == "output_1"
    assert "documents" in output
    documents = output["documents"]
    assert len(documents) == 1
    assert isinstance(documents[0], Document)
    assert documents[0].content == question
    assert documents[0].meta["answer"] == answer
@pytest.mark.integration
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    """Both header names wrong: the converter must reject the file."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["wrong", "headers"]
    row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."]
    write_as_csv([header, row], csv_path)

    converter = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        converter.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    """First header name wrong ('wrong' instead of 'question'): must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    content = [
        ["wrong", "answers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(content, csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    """Second header name wrong ('wrong' instead of 'answer'): must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv(
        [
            ["question", "wrong"],
            ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
        ],
        csv_path,
    )

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_one_column(tmp_path):
    """A single-column CSV (answer column missing) must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    write_as_csv([["question"], ["What is Haystack ?"]], csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_with_three_columns(tmp_path):
    """A CSV with an extra third column must be rejected."""
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    header = ["question", "answer", "notes"]
    row = ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"]
    write_as_csv([header, row], csv_path)

    node = CsvTextConverter()
    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)
@pytest.mark.integration
def test_csv_to_document_many_files(tmp_path):
    """Converting a list of CSV files yields one Document per file, in input order."""
    # A single converter instance handles all files; the original test
    # needlessly re-created the node on every loop iteration.
    node = CsvTextConverter()

    csv_paths = []
    for i in range(5):
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        rows = [
            ["question", "answer"],
            [
                f"{i}. What is Haystack ?",
                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
            ],
        ]
        write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_paths)

    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i in range(5):
        doc = output["documents"][i]
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."