import json
from pathlib import Path
from re import search

import pytest

from haystack.connector import Crawler
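
# NOTE: These tests crawl live pages under https://haystack.deepset.ai/, so
# they need network access and can break if the site's content changes.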


def test_crawler_url_none_exception(tmp_path):
    tmp_dir = tmp_path / "crawled_files"

    with pytest.raises(ValueError):
        Crawler(tmp_dir).crawl()
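

# crawl() writes one JSON file per fetched page and returns the file paths.
# crawler_depth=0 fetches only the given URLs; crawler_depth=1 also follows
# links found on those pages.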
def test_crawler_depth(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/get-started"]

    crawler = Crawler(output_dir=tmp_dir)
    doc_path = crawler.crawl(urls=_url, crawler_depth=0)
    assert len(doc_path) == 1

    _urls = [
        "https://haystack.deepset.ai/overview/v0.8.0/get-started",
        "https://haystack.deepset.ai/overview/v0.7.0/get-started",
        "https://haystack.deepset.ai/overview/v0.6.0/get-started",
    ]
    doc_path = crawler.crawl(urls=_urls, crawler_depth=0)
    assert len(doc_path) == 3

    doc_path = crawler.crawl(urls=_url, crawler_depth=1)
    assert len(doc_path) > 1

    for json_file in doc_path:
        assert isinstance(json_file, Path)
        with open(json_file.absolute(), "r") as read_file:
            data = json.load(read_file)
            assert 'content' in data
            assert 'meta' in data
            assert isinstance(data['content'], str)
            assert len(data['content'].split()) > 2
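

# filter_urls takes regular expression patterns; only URLs matching at least
# one pattern are fetched.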
def test_crawler_filter_urls(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/overview/v0.8.0/"]

    crawler = Crawler(output_dir=tmp_dir)
    # raw strings keep "\." and "\/" from being treated as invalid escape sequences
    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.9\.0\/"])
    assert len(doc_path) == 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"haystack\.deepset\.ai\/overview\/v0\.8\.0\/"])
    assert len(doc_path) > 0

    doc_path = crawler.crawl(urls=_url, filter_urls=[r"google\.com"])
    assert len(doc_path) == 0
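

# each entry pairs a URL with text snippets that must appear in the crawled
# 'content' of that page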
def test_crawler_content(tmp_path):
    tmp_dir = tmp_path / "crawled_files"

    partial_content_match: list = [
        {"url": "https://haystack.deepset.ai/overview/v0.7.0/intro",
         "partial_content": ["What is Haystack",
                             "Utilize all transformer based models",
                             "a Retriever-Reader pipeline in order",
                             "Passing on only a small candidate set",
                             "fast indexing and querying",
                             "Fine-tune models to your own domain",
                             "smoothly switch when new ones get published"]},
        {"url": "https://haystack.deepset.ai/overview/v0.7.0/use-cases",
         "partial_content": ["Semantic Search System",
                             "Store your documents in the database of ",
                             "results are chosen based on compatibility in",
                             "Apply a set of standard questions to each document",
                             "Return a NO_ANSWER if a given document",
                             "like what is the revenue forecast for 2020?",
                             "overview of academic papers and internal business"]}]

    crawler = Crawler(output_dir=tmp_dir)
    for _dict in partial_content_match:
        url: str = _dict['url']
        partial_content: list = _dict['partial_content']

        doc_path = crawler.crawl(urls=[url], crawler_depth=0)
        assert len(doc_path) == 1

        for json_file in doc_path:
            assert isinstance(json_file, Path)
            with open(json_file.absolute(), "r") as read_file:
                content = json.load(read_file)
                assert isinstance(content['content'], str)
                for partial_line in partial_content:
                    assert search(partial_line, content['content'])
                    assert partial_line in content['content']
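

# Crawler.run(..., return_documents=True) returns the crawled documents in
# memory; their 'meta' and 'content' must match what crawl() wrote to disk.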
def test_crawler_return_document(tmp_path):
    tmp_dir = tmp_path / "crawled_files"
    _url = ["https://haystack.deepset.ai/docs/v0.5.0/intromd"]

    crawler = Crawler(output_dir=tmp_dir)
    docs_path = crawler.crawl(urls=_url, crawler_depth=1)
    results, _ = crawler.run(urls=_url, crawler_depth=1, return_documents=True)
    documents = results['documents']

    for json_file, document in zip(docs_path, documents):
        assert isinstance(json_file, Path)
        assert isinstance(document, dict)

        with open(json_file.absolute(), "r") as read_file:
            file_content = json.load(read_file)
            assert file_content['meta'] == document['meta']
            assert file_content['content'] == document['content']