2023-02-03 15:39:24 -08:00
|
|
|
"""Read Microsoft Word files."""
|
|
|
|
|
|
|
|
from pathlib import Path
|
2023-02-03 23:38:12 -08:00
|
|
|
from typing import Dict, List, Optional
|
2023-02-03 15:39:24 -08:00
|
|
|
|
2023-02-20 21:46:58 -08:00
|
|
|
from llama_index.readers.base import BaseReader
|
|
|
|
from llama_index.readers.schema.base import Document
|
2023-02-03 15:39:24 -08:00
|
|
|
|
|
|
|
|
|
|
|
class DocxReader(BaseReader):
    """Reader that extracts the text of a Microsoft Word (.docx) file."""

    def load_data(
        self, file: Path, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """Parse a Word file into a single-element list of documents.

        Args:
            file: Location of the file to read. A plain ``str`` path is
                also accepted and is converted to a ``Path`` first.
            extra_info: Optional metadata merged on top of the default
                ``{"file_name": ...}`` entry. On a key clash the value
                from ``extra_info`` wins.

        Returns:
            A list holding one ``Document`` built from the extracted text
            and the combined metadata.
        """
        # Imported at call time rather than at module import time.
        # NOTE(review): presumably so docx2txt is only required when a
        # Word file is actually read -- confirm against packaging.
        import docx2txt

        # A str has no ``.name`` attribute, so the metadata line below
        # would raise AttributeError. Only str is converted so that any
        # other object already exposing ``.name`` keeps working as before.
        if isinstance(file, str):
            file = Path(file)

        text = docx2txt.process(file)

        metadata = {"file_name": file.name}
        if extra_info is not None:
            # Caller-supplied keys deliberately override the defaults.
            metadata.update(extra_info)

        return [Document(text, extra_info=metadata)]
|