mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-08-14 19:51:25 +00:00
43 lines
1.2 KiB
Python
43 lines
1.2 KiB
Python
![]() |
"""Epub Reader.
|
||
|
|
||
|
A parser for epub files.
|
||
|
"""
|
||
|
|
||
|
from pathlib import Path
|
||
|
from typing import Dict, Optional, List
|
||
|
|
||
|
from gpt_index.readers.base import BaseReader
|
||
|
from gpt_index.readers.schema.base import Document
|
||
|
|
||
|
|
||
|
class EpubReader(BaseReader):
|
||
|
"""Epub Parser."""
|
||
|
|
||
|
def load_data(
|
||
|
self, file: Path, extra_info: Optional[Dict] = None
|
||
|
) -> List[Document]:
|
||
|
"""Parse file."""
|
||
|
try:
|
||
|
import ebooklib
|
||
|
from ebooklib import epub
|
||
|
except ImportError:
|
||
|
raise ValueError("`EbookLib` is required to read Epub files.")
|
||
|
try:
|
||
|
import html2text
|
||
|
except ImportError:
|
||
|
raise ValueError("`html2text` is required to parse Epub files.")
|
||
|
|
||
|
text_list = []
|
||
|
book = epub.read_epub(file, options={"ignore_ncx": True})
|
||
|
|
||
|
# Iterate through all chapters.
|
||
|
for item in book.get_items():
|
||
|
# Chapters are typically located in epub documents items.
|
||
|
if item.get_type() == ebooklib.ITEM_DOCUMENT:
|
||
|
text_list.append(
|
||
|
html2text.html2text(item.get_content().decode("utf-8"))
|
||
|
)
|
||
|
|
||
|
text = "\n".join(text_list)
|
||
|
return [Document(text, extra_info=extra_info)]
|