26 lines
624 B
Python
Raw Normal View History

"""Read Microsoft Word files."""
from pathlib import Path
2023-02-03 23:38:12 -08:00
from typing import Dict, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
class DocxReader(BaseReader):
    """Reader that extracts the plain text of a Microsoft Word (.docx) file."""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a .docx file into a single Document.

        Args:
            file: Path of the .docx file to read. A plain string path is
                also tolerated.
            extra_info: Optional extra metadata merged into the document's
                metadata. On a key collision (including ``"file_name"``)
                the value from ``extra_info`` wins.

        Returns:
            A one-element list holding a Document built from the extracted
            text, with metadata containing at least ``"file_name"``.
        """
        # Imported here rather than at module level, so docx2txt is only
        # required when load_data is actually called.
        import docx2txt

        text = docx2txt.process(file)

        # Path(...) is a no-op for Path inputs and makes .name available
        # when the caller handed in a string path instead.
        metadata = {"file_name": Path(file).name}
        if extra_info is not None:
            # Build on a fresh dict so the caller's extra_info is not mutated.
            metadata.update(extra_info)

        return [Document(text, extra_info=metadata)]