mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-13 19:21:15 +00:00

* cr * cr * cr --------- Co-authored-by: Jerry Liu <jerry@robustintelligence.com> Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
43 lines
1.3 KiB
Python
43 lines
1.3 KiB
Python
"""Unstructured file reader.
|
|
|
|
A parser for unstructured text files using Unstructured.io.
|
|
Supports .txt, .docx, .pptx, .jpg, .png, .eml, .html, and .pdf documents.
|
|
|
|
"""
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
from llama_index.readers.base import BaseReader
|
|
from llama_index.readers.schema.base import Document
|
|
|
|
|
|
class UnstructuredReader(BaseReader):
|
|
"""General unstructured text reader for a variety of files."""
|
|
|
|
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
|
"""Init params."""
|
|
super().__init__(*args, **kwargs)
|
|
|
|
# Prerequisite for Unstructured.io to work
|
|
import nltk
|
|
|
|
nltk.download("punkt")
|
|
nltk.download("averaged_perceptron_tagger")
|
|
|
|
def load_data(
|
|
self,
|
|
file: Path,
|
|
extra_info: Optional[Dict] = None,
|
|
split_documents: Optional[bool] = False,
|
|
) -> List[Document]:
|
|
"""Parse file."""
|
|
from unstructured.partition.auto import partition
|
|
|
|
elements = partition(str(file))
|
|
text_chunks = [" ".join(str(el).split()) for el in elements]
|
|
|
|
if split_documents:
|
|
return [Document(chunk, extra_info=extra_info) for chunk in text_chunks]
|
|
else:
|
|
return [Document("\n\n".join(text_chunks), extra_info=extra_info)]
|