feat: Add CsvTextConverter (#3587)
* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline. Fixes #3550: lets users build a full FAQ pipeline from a YAML pipeline description, with CSV import and indexing.
* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline. Fix mypy and pylint linter issues.
* feat: Add Csv2Documents, EmbedDocuments nodes and FAQ indexing pipeline. Fix mypy linter issues.
* implement proposal's feedback
* tidy up for merge
* use BaseConverter
* use BaseConverter
* pylint
* black
* Revert "black". This reverts commit e1c45cb1848408bd52a630328750cb67c8eb7110.
* black
* add check for column names
* add check for column names
* add tests
* fix tests
* address lists of paths
* typo
* remove duplicate line

Co-authored-by: ZanSara <sarazanzo94@gmail.com>
This commit is contained in:
parent 94f660c56f
commit eed009eddb
@@ -19,6 +19,7 @@ from haystack.nodes.file_converter import (
    TextConverter,
    AzureConverter,
    ParsrConverter,
    CsvTextConverter,
)
from haystack.nodes.label_generator import PseudoLabelGenerator
from haystack.nodes.other import Docs2Answers, JoinDocuments, RouteDocuments, JoinAnswers, DocumentMerger
@@ -2,6 +2,7 @@ from haystack.nodes.file_converter.base import BaseConverter

from haystack.utils.import_utils import safe_import

from haystack.nodes.file_converter.csv import CsvTextConverter
from haystack.nodes.file_converter.docx import DocxToTextConverter
from haystack.nodes.file_converter.tika import TikaConverter, TikaXHTMLParser
from haystack.nodes.file_converter.txt import TextConverter
haystack/nodes/file_converter/csv.py (new file, 61 lines)
@@ -0,0 +1,61 @@
from typing import Union, List, Optional, Any, Dict

import logging
from pathlib import Path

import pandas as pd

from haystack import Document
from haystack.nodes.file_converter import BaseConverter


logger = logging.getLogger(__name__)


class CsvTextConverter(BaseConverter):
    """
    Converts Question & Answers CSV files to text Documents.
    """

    outgoing_edges = 1

    def convert(
        self,
        file_path: Union[Path, List[Path], str, List[str], List[Union[Path, str]]],
        meta: Optional[Dict[str, Any]],
        remove_numeric_tables: Optional[bool] = None,
        valid_languages: Optional[List[str]] = None,
        encoding: Optional[str] = "UTF-8",
        id_hash_keys: Optional[List[str]] = None,
    ) -> List[Document]:
        """
        Load a CSV file and convert it to Documents.

        :param file_path: Path to a CSV file containing two columns.
            The first will be interpreted as a question, the second as the answer.
        :returns: List of Documents, one Document per row in the CSV.
        """
        if not isinstance(file_path, list):
            file_path = [file_path]

        docs: List[Document] = []
        for path in file_path:
            df = pd.read_csv(path, encoding=encoding)

            if len(df.columns) != 2 or df.columns[0] != "question" or df.columns[1] != "answer":
                raise ValueError("The CSV must contain two columns named 'question' and 'answer'")

            df.fillna(value="", inplace=True)
            df["question"] = df["question"].apply(lambda x: x.strip())

            df = df.rename(columns={"question": "content"})
            docs_dicts = df.to_dict(orient="records")

            for dictionary in docs_dicts:
                if meta:
                    dictionary["meta"] = meta
                if id_hash_keys:
                    dictionary["id_hash_keys"] = id_hash_keys
                docs.append(Document.from_dict(dictionary))

        return docs
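A minimal usage sketch of the new node on its own, mirroring the integration tests further down; the faq.csv file name is only an illustration and the file must contain the 'question' and 'answer' columns described above:

from pathlib import Path

from haystack.nodes import CsvTextConverter

converter = CsvTextConverter()
# Each CSV row becomes one Document: the question is stored as doc.content,
# the answer ends up in doc.meta["answer"].
output, edge = converter.run(file_paths=Path("faq.csv"))
for doc in output["documents"]:
    print(doc.content, "->", doc.meta["answer"])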
@@ -0,0 +1,40 @@
# To allow your IDE to autocomplete and validate your YAML pipelines, name them as <name of your choice>.haystack-pipeline.yml

version: ignore

components: # define all the building-blocks for Pipeline
  - name: DocumentStore
    type: ElasticsearchDocumentStore
    params:
      host: localhost
      embedding_field: question_emb
      embedding_dim: 384
      excluded_meta_data:
        - question_emb
      similarity: cosine
  - name: Retriever
    type: EmbeddingRetriever
    params:
      document_store: DocumentStore # params can reference other components defined in the YAML
      embedding_model: sentence-transformers/all-MiniLM-L6-v2
      scale_score: False
  - name: Doc2Answers # custom-name for the component; helpful for visualization & debugging
    type: Docs2Answers # Haystack Class name for the component
  - name: CsvTextConverter
    type: CsvTextConverter

pipelines:
  - name: query # a sample extractive-qa Pipeline
    nodes:
      - name: Retriever
        inputs: [Query]
      - name: Doc2Answers
        inputs: [Retriever]
  - name: indexing
    nodes:
      - name: CsvTextConverter
        inputs: [File]
      - name: Retriever
        inputs: [CsvTextConverter]
      - name: DocumentStore
        inputs: [Retriever]
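A sketch of how these pipelines could be loaded and run, assuming the YAML above is saved as faq.haystack-pipeline.yml (a hypothetical name), an Elasticsearch instance is reachable on localhost, and faq.csv holds the question/answer pairs:

from pathlib import Path

from haystack import Pipeline

# Indexing: CsvTextConverter turns each CSV row into a Document, the
# EmbeddingRetriever embeds the question text, and the DocumentStore stores it.
indexing = Pipeline.load_from_yaml(Path("faq.haystack-pipeline.yml"), pipeline_name="indexing")
indexing.run(file_paths=[Path("faq.csv")])

# Querying: the Retriever matches the incoming question against the stored
# questions and Docs2Answers converts the retrieved Documents into Answers.
query = Pipeline.load_from_yaml(Path("faq.haystack-pipeline.yml"), pipeline_name="query")
result = query.run(query="What is Haystack?")
print(result["answers"])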
@@ -1,10 +1,14 @@
from typing import List

import os
import sys
from pathlib import Path
import subprocess
import csv

import pytest

from haystack import Document
from haystack.nodes import (
    MarkdownConverter,
    DocxToTextConverter,
@@ -14,6 +18,7 @@ from haystack.nodes import (
    AzureConverter,
    ParsrConverter,
    TextConverter,
    CsvTextConverter,
)

from ..conftest import SAMPLES_PATH
@@ -265,3 +270,125 @@ def test_id_hash_keys_from_pipeline_params():

    assert len(documents) == 2
    assert len(unique_ids) == 2


def write_as_csv(data: List[List[str]], file_path: Path):
    with open(file_path, "w") as f:
        writer = csv.writer(f)
        writer.writerows(data)


@pytest.mark.integration
def test_csv_to_document_with_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_headers.csv"
    rows = [
        ["question", "answer"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_path)
    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 1

    doc = output["documents"][0]
    assert isinstance(doc, Document)
    assert doc.content == "What is Haystack ?"
    assert doc.meta["answer"] == "Haystack is an NLP Framework to use transformers in your Applications."


@pytest.mark.integration
def test_csv_to_document_with_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["wrong", "headers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_one_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["wrong", "answers"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_another_wrong_qa_headers(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["question", "wrong"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications."],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_one_column(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [["question"], ["What is Haystack ?"]]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_with_three_columns(tmp_path):
    node = CsvTextConverter()
    csv_path = tmp_path / "csv_qa_with_wrong_headers.csv"
    rows = [
        ["question", "answer", "notes"],
        ["What is Haystack ?", "Haystack is an NLP Framework to use transformers in your Applications.", "verified"],
    ]
    write_as_csv(rows, csv_path)

    with pytest.raises(ValueError, match="The CSV must contain two columns named 'question' and 'answer'"):
        node.run(file_paths=csv_path)


@pytest.mark.integration
def test_csv_to_document_many_files(tmp_path):
    csv_paths = []
    for i in range(5):
        node = CsvTextConverter()
        csv_path = tmp_path / f"{i}_csv_qa_with_headers.csv"
        csv_paths.append(csv_path)
        rows = [
            ["question", "answer"],
            [
                f"{i}. What is Haystack ?",
                f"{i}. Haystack is an NLP Framework to use transformers in your Applications.",
            ],
        ]
        write_as_csv(rows, csv_path)

    output, edge = node.run(file_paths=csv_paths)
    assert edge == "output_1"
    assert "documents" in output
    assert len(output["documents"]) == 5

    for i in range(5):
        doc = output["documents"][i]
        assert isinstance(doc, Document)
        assert doc.content == f"{i}. What is Haystack ?"
        assert doc.meta["answer"] == f"{i}. Haystack is an NLP Framework to use transformers in your Applications."
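The tests above drive the node through run(); convert() can also be called directly and, per the signature added in csv.py, accepts meta and id_hash_keys that are attached to every resulting Document. A minimal sketch, with questions.csv and the meta values as hypothetical examples:

from pathlib import Path

from haystack.nodes import CsvTextConverter

converter = CsvTextConverter()
docs = converter.convert(
    file_path=Path("questions.csv"),
    meta={"source": "faq-export"},  # hypothetical metadata shared by all rows
    id_hash_keys=["content", "meta"],  # hash content and meta into the Document id
)
print(docs[0].content, docs[0].meta)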