feat: add web URL loader & refine indexing logic (#397)

* feat: add web URL loader & refine indexing logic

* fix: appease mypy
Tuan Anh Nguyen Dang (Tadashi_Cin) 2024-10-15 22:42:24 +07:00 committed by GitHub
parent 41966fcd5b
commit b113efc855
7 changed files with 183 additions and 60 deletions

View File

@@ -21,8 +21,10 @@ from kotaemon.loaders import (
PDFThumbnailReader,
TxtReader,
UnstructuredReader,
+    WebReader,
)
+web_reader = WebReader()
unstructured = UnstructuredReader()
adobe_reader = AdobeReader()
azure_reader = AzureAIDocumentIntelligenceLoader(

View File

@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
from .pdf_loader import PDFThumbnailReader
from .txt_loader import TxtReader
from .unstructured_loader import UnstructuredReader
+from .web_loader import WebReader
__all__ = [
"AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
"AdobeReader",
"TxtReader",
"PDFThumbnailReader",
"WebReader",
]

View File

@@ -0,0 +1,43 @@
from pathlib import Path
from typing import Optional
import requests
from decouple import config
from kotaemon.base import Document
from .base import BaseReader
JINA_API_KEY = config("JINA_API_KEY", default="")
JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
class WebReader(BaseReader):
    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)

    def fetch_url(self, url: str):
        # build the Jina Reader request: the target URL is appended
        # to the configured reader endpoint
        api_url = f"{JINA_URL}{url}"
        headers = {
            "X-With-Links-Summary": "true",
        }
        if JINA_API_KEY:
            headers["Authorization"] = f"Bearer {JINA_API_KEY}"

        response = requests.get(api_url, headers=headers)
        response.raise_for_status()

        data = response.text
        return data

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        file_path = str(file_path)
        output = self.fetch_url(file_path)
        metadata = extra_info or {}

        return [Document(text=output, metadata=metadata)]
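A quick usage sketch (not part of the commit; the URL is hypothetical). One caveat visible above: run() wraps its argument in Path(), which collapses the "//" in a URL scheme, so URLs should go through load_data() directly, as the indexing pipeline does:

# usage sketch, assuming an optional JINA_API_KEY in the environment
from kotaemon.loaders import WebReader

reader = WebReader()  # fetches pages through the configured Jina Reader endpoint
docs = reader.load_data("https://example.com/post", extra_info={"source": "web"})
print(docs[0].text[:200])  # extracted page content
print(docs[0].metadata)    # {'source': 'web'}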

View File

@@ -57,7 +57,7 @@ def prepare_graph_index_path(graph_id: str):
class GraphRAGIndexingPipeline(IndexDocumentPipeline):
"""GraphRAG specific indexing pipeline"""
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Simply disable the splitter (chunking) for this pipeline"""
pipeline = super().route(file_path)
pipeline.splitter = None

View File

@@ -32,7 +32,7 @@ class KnetIndexingPipeline(IndexDocumentPipeline):
},
}
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Simply disable the splitter (chunking) for this pipeline"""
pipeline = super().route(file_path)
pipeline.splitter = None

View File

@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
adobe_reader,
azure_reader,
unstructured,
+    web_reader,
)
from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@@ -444,7 +445,7 @@ class IndexPipeline(BaseComponent):
session.add_all(nodes)
session.commit()
-    def get_id_if_exists(self, file_path: Path) -> Optional[str]:
+    def get_id_if_exists(self, file_path: str | Path) -> Optional[str]:
"""Check if the file is already indexed
Args:
@@ -453,13 +454,14 @@ class IndexPipeline(BaseComponent):
Returns:
the file id if the file is indexed, otherwise None
"""
+        file_name = file_path.name if isinstance(file_path, Path) else file_path
if self.private:
cond: tuple = (
-                self.Source.name == file_path.name,
+                self.Source.name == file_name,
self.Source.user == self.user_id,
)
else:
-            cond = (self.Source.name == file_path.name,)
+            cond = (self.Source.name == file_name,)
with Session(engine) as session:
stmt = select(self.Source).where(*cond)
@@ -469,6 +471,29 @@ class IndexPipeline(BaseComponent):
return None
+    def store_url(self, url: str) -> str:
+        """Store URL into the database and storage, return the file id
+
+        Args:
+            url: the URL
+
+        Returns:
+            the file id
+        """
+        file_hash = sha256(url.encode()).hexdigest()
+        source = self.Source(
+            name=url,
+            path=file_hash,
+            size=0,
+            user=self.user_id,  # type: ignore
+        )
+        with Session(engine) as session:
+            session.add(source)
+            session.commit()
+            file_id = source.id
+
+        return file_id
def store_file(self, file_path: Path) -> str:
"""Store file into the database and storage, return the file id
@@ -495,7 +520,7 @@ class IndexPipeline(BaseComponent):
return file_id
-    def finish(self, file_id: str, file_path: Path) -> str:
+    def finish(self, file_id: str, file_path: str | Path) -> str:
"""Finish the indexing"""
with Session(engine) as session:
stmt = select(self.Source).where(self.Source.id == file_id)
@@ -561,37 +586,55 @@ class IndexPipeline(BaseComponent):
def stream(
self, file_path: str | Path, reindex: bool, **kwargs
) -> Generator[Document, None, tuple[str, list[Document]]]:
-        # check for duplication
-        file_path = Path(file_path).resolve()
+        # check if the file is already indexed
+        if isinstance(file_path, Path):
+            file_path = file_path.resolve()

        file_id = self.get_id_if_exists(file_path)

-        if file_id is not None:
-            if not reindex:
-                raise ValueError(
-                    f"File {file_path.name} already indexed. Please rerun with "
-                    "reindex=True to force reindexing."
-                )
+        if isinstance(file_path, Path):
+            if file_id is not None:
+                if not reindex:
+                    raise ValueError(
+                        f"File {file_path.name} already indexed. Please rerun with "
+                        "reindex=True to force reindexing."
+                    )
+                else:
+                    # remove the existing records
+                    yield Document(
+                        f" => Removing old {file_path.name}", channel="debug"
+                    )
+                    self.delete_file(file_id)
+                    file_id = self.store_file(file_path)
-            else:
-                # remove the existing records
-                yield Document(f" => Removing old {file_path.name}", channel="debug")
-                self.delete_file(file_id)
-
-        # add record to db
-        file_id = self.store_file(file_path)
+            else:
+                # add record to db
+                file_id = self.store_file(file_path)
+        else:
+            if file_id is not None:
+                raise ValueError(f"URL {file_path} already indexed.")
+            else:
+                # add record to db
+                file_id = self.store_url(file_path)
# extract the file
-        extra_info = default_file_metadata_func(str(file_path))
+        if isinstance(file_path, Path):
+            extra_info = default_file_metadata_func(str(file_path))
+            file_name = file_path.name
+        else:
+            extra_info = {"file_name": file_path}
+            file_name = file_path
extra_info["file_id"] = file_id
extra_info["collection_name"] = self.collection_name
yield Document(f" => Converting {file_path.name} to text", channel="debug")
yield Document(f" => Converting {file_name} to text", channel="debug")
docs = self.loader.load_data(file_path, extra_info=extra_info)
yield Document(f" => Converted {file_path.name} to text", channel="debug")
yield from self.handle_docs(docs, file_id, file_path.name)
yield Document(f" => Converted {file_name} to text", channel="debug")
yield from self.handle_docs(docs, file_id, file_name)
self.finish(file_id, file_path)
yield Document(f" => Finished indexing {file_path.name}", channel="debug")
yield Document(f" => Finished indexing {file_name}", channel="debug")
return file_id, docs
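Condensed, the new duplication handling amounts to the sketch below (a simplified stand-in, not the literal method): local files can be force-reindexed, while an already-indexed URL always raises.

from pathlib import Path

def dispatch_store(pipeline, file_path, file_id, reindex):
    # simplified mirror of the dedup block above
    if isinstance(file_path, Path):
        if file_id is not None:
            if not reindex:
                raise ValueError(f"File {file_path.name} already indexed.")
            pipeline.delete_file(file_id)  # force re-index: drop old records first
        return pipeline.store_file(file_path)
    if file_id is not None:
        raise ValueError(f"URL {file_path} already indexed.")  # no reindex path for URLs
    return pipeline.store_url(file_path)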
@@ -658,20 +701,30 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
)
return obj
-    def route(self, file_path: Path) -> IndexPipeline:
+    def is_url(self, file_path: str | Path) -> bool:
+        return isinstance(file_path, str) and (
+            file_path.startswith("http://") or file_path.startswith("https://")
+        )
+
+    def route(self, file_path: str | Path) -> IndexPipeline:
"""Decide the pipeline based on the file type
Can subclass this method for a more elaborate pipeline routing strategy.
"""
_, chunk_size, chunk_overlap = dev_settings()
-        ext = file_path.suffix.lower()
-        reader = self.readers.get(ext, unstructured)
-        if reader is None:
-            raise NotImplementedError(
-                f"No supported pipeline to index {file_path.name}. Please specify "
-                "the suitable pipeline for this file type in the settings."
-            )
+        # check if file_path is a URL
+        if self.is_url(file_path):
+            reader = web_reader
+        else:
+            assert isinstance(file_path, Path)
+            ext = file_path.suffix.lower()
+            reader = self.readers.get(ext, unstructured)
+            if reader is None:
+                raise NotImplementedError(
+                    f"No supported pipeline to index {file_path.name}. Please specify "
+                    "the suitable pipeline for this file type in the settings."
+                )
print("Using reader", reader)
pipeline: IndexPipeline = IndexPipeline(
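The routing predicate is deliberately narrow; this standalone mirror (illustrative) shows what goes where:

from pathlib import Path

def is_url(file_path) -> bool:
    # mirror of IndexDocumentPipeline.is_url above
    return isinstance(file_path, str) and (
        file_path.startswith("http://") or file_path.startswith("https://")
    )

assert is_url("https://example.com")    # string URL -> web_reader
assert not is_url(Path("report.pdf"))   # Path -> extension lookup in self.readers
assert not is_url("ftp://host/file")    # non-http(s) strings are not URLs here; route() would hit its Path assert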
@@ -715,9 +768,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
n_files = len(file_paths)
for idx, file_path in enumerate(file_paths):
-            file_path = Path(file_path)
+            if self.is_url(file_path):
+                file_name = file_path
+            else:
+                file_path = Path(file_path)
+                file_name = file_path.name
yield Document(
content=f"Indexing [{idx + 1}/{n_files}]: {file_path.name}",
content=f"Indexing [{idx + 1}/{n_files}]: {file_name}",
channel="debug",
)
@@ -730,7 +788,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
file_ids.append(file_id)
errors.append(None)
yield Document(
content={"file_path": file_path, "status": "success"},
content={
"file_path": file_path,
"file_name": file_name,
"status": "success",
},
channel="index",
)
except Exception as e:
@@ -740,6 +802,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
yield Document(
content={
"file_path": file_path,
"file_name": file_name,
"status": "failed",
"message": str(e),
},
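With this change the "index" channel always carries a printable file_name next to file_path; the two payload shapes look like this (values made up):

# illustrative payloads yielded on the "index" channel
success = {"file_path": "docs/report.pdf", "file_name": "report.pdf", "status": "success"}
failure = {"file_path": "https://example.com/post", "file_name": "https://example.com/post",
           "status": "failed", "message": "HTTPError: 404"}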

View File

@@ -111,18 +111,25 @@ class FileIndexPage(BasePage):
"""Build the UI of the app"""
with gr.Row():
with gr.Column(scale=1):
gr.Markdown("## File Upload")
with gr.Column() as self.upload:
-                    self.files = File(
-                        file_types=self._supported_file_types,
-                        file_count="multiple",
-                        container=True,
-                        show_label=False,
-                    )
+                    with gr.Tab("Upload Files"):
+                        self.files = File(
+                            file_types=self._supported_file_types,
+                            file_count="multiple",
+                            container=True,
+                            show_label=False,
+                        )
-                    msg = self.upload_instruction()
-                    if msg:
-                        gr.Markdown(msg)
+                        msg = self.upload_instruction()
+                        if msg:
+                            gr.Markdown(msg)
+
+                    with gr.Tab("Use Web Links"):
+                        self.urls = gr.Textbox(
+                            label="Input web URLs",
+                            lines=8,
+                        )
+                        gr.Markdown("(separated by new line)")
with gr.Accordion("Advanced indexing options", open=True):
with gr.Row():
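For orientation, a self-contained sketch of the reworked upload area (assumes the gradio API shown above; layout simplified):

import gradio as gr

with gr.Blocks() as demo:
    with gr.Tab("Upload Files"):
        files = gr.File(file_count="multiple", show_label=False)
    with gr.Tab("Use Web Links"):
        urls = gr.Textbox(label="Input web URLs", lines=8)
        gr.Markdown("(separated by new line)")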
@@ -525,6 +532,7 @@ class FileIndexPage(BasePage):
fn=self.index_fn,
inputs=[
self.files,
+                self.urls,
self.reindex,
self._app.settings_state,
self._app.user_id,
@@ -670,28 +678,33 @@ class FileIndexPage(BasePage):
return remaining_files
def index_fn(
-        self, files, reindex: bool, settings, user_id
+        self, files, urls, reindex: bool, settings, user_id
) -> Generator[tuple[str, str], None, None]:
"""Upload and index the files
Args:
files: the list of files to be uploaded
+            urls: list of web URLs to be indexed
reindex: whether to reindex the files
settings: the settings of the app
"""
-        if not files:
-            gr.Info("No uploaded file")
-            yield "", ""
-            return
+        if urls:
+            files = [it.strip() for it in urls.split("\n")]
+            errors = []
+        else:
+            if not files:
+                gr.Info("No uploaded file")
+                yield "", ""
+                return
+
+            files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)
-        files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)
-
-        errors = self.validate(files)
-        if errors:
-            gr.Warning(", ".join(errors))
-            yield "", ""
-            return
+            errors = self.validate(files)
+            if errors:
+                gr.Warning(", ".join(errors))
+                yield "", ""
+                return
gr.Info(f"Start indexing {len(files)} files...")
@@ -708,10 +721,10 @@ class FileIndexPage(BasePage):
continue
if response.channel == "index":
if response.content["status"] == "success":
outputs.append(f"\u2705 | {response.content['file_path'].name}")
outputs.append(f"\u2705 | {response.content['file_name']}")
elif response.content["status"] == "failed":
outputs.append(
f"\u274c | {response.content['file_path'].name}: "
f"\u274c | {response.content['file_name']}: "
f"{response.content['message']}"
)
elif response.channel == "debug":
@@ -764,7 +777,7 @@ class FileIndexPage(BasePage):
settings[f"index.options.{self._index.id}.reader_mode"] = "default"
settings[f"index.options.{self._index.id}.quick_index_mode"] = True
if to_process_files:
-            _iter = self.index_fn(to_process_files, reindex, settings, user_id)
+            _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
try:
while next(_iter):
pass
@@ -844,7 +857,7 @@ class FileIndexPage(BasePage):
for p in exclude_patterns:
files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]
-        yield from self.index_fn(files, reindex, settings, user_id)
+        yield from self.index_fn(files, [], reindex, settings, user_id)
def format_size_human_readable(self, num: float | str, suffix="B"):
try: