Mirror of https://github.com/Cinnamon/kotaemon.git (synced 2025-06-26 23:19:56 +00:00)

feat: add web URL loader & refine indexing logics (#397)

* feat: add web URL loader & refine indexing logics
* fix: comfort mypy

Parent: 41966fcd5b
Commit: b113efc855
@@ -21,8 +21,10 @@ from kotaemon.loaders import (
     PDFThumbnailReader,
     TxtReader,
     UnstructuredReader,
+    WebReader,
 )

+web_reader = WebReader()
 unstructured = UnstructuredReader()
 adobe_reader = AdobeReader()
 azure_reader = AzureAIDocumentIntelligenceLoader(
@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
 from .pdf_loader import PDFThumbnailReader
 from .txt_loader import TxtReader
 from .unstructured_loader import UnstructuredReader
+from .web_loader import WebReader

 __all__ = [
     "AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
     "AdobeReader",
     "TxtReader",
     "PDFThumbnailReader",
+    "WebReader",
 ]
libs/kotaemon/kotaemon/loaders/web_loader.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from pathlib import Path
+from typing import Optional
+
+import requests
+from decouple import config
+
+from kotaemon.base import Document
+
+from .base import BaseReader
+
+JINA_API_KEY = config("JINA_API_KEY", default="")
+JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
+
+
+class WebReader(BaseReader):
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def fetch_url(self, url: str):
+        # setup the request
+        api_url = f"https://r.jina.ai/{url}"
+        headers = {
+            "X-With-Links-Summary": "true",
+        }
+        if JINA_API_KEY:
+            headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+
+        response = requests.get(api_url, headers=headers)
+        response.raise_for_status()
+
+        data = response.text
+        return data
+
+    def load_data(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        file_path = str(file_path)
+        output = self.fetch_url(file_path)
+        metadata = extra_info or {}
+
+        return [Document(text=output, metadata=metadata)]
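Not part of the diff: a minimal usage sketch of the new reader. It assumes the package is importable and that JINA_API_KEY is optionally set via the environment or a .env file; the target URL is hypothetical. The indexing pipeline below calls load_data directly with the URL string, so the sketch does the same.

# Sketch only (not in the commit): fetch a page through the r.jina.ai reader proxy.
from kotaemon.loaders import WebReader

web_reader = WebReader()
docs = web_reader.load_data("https://example.com")  # hypothetical URL
print(docs[0].text[:200])   # text returned by the reader endpoint
print(docs[0].metadata)     # whatever extra_info was passed in (empty here)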
@@ -57,7 +57,7 @@ def prepare_graph_index_path(graph_id: str):
 class GraphRAGIndexingPipeline(IndexDocumentPipeline):
     """GraphRAG specific indexing pipeline"""

-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -32,7 +32,7 @@ class KnetIndexingPipeline(IndexDocumentPipeline):
         },
     }

-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
     adobe_reader,
     azure_reader,
     unstructured,
+    web_reader,
 )
 from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
 from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@@ -444,7 +445,7 @@ class IndexPipeline(BaseComponent):
             session.add_all(nodes)
             session.commit()

-    def get_id_if_exists(self, file_path: Path) -> Optional[str]:
+    def get_id_if_exists(self, file_path: str | Path) -> Optional[str]:
         """Check if the file is already indexed

         Args:
@@ -453,13 +454,14 @@ class IndexPipeline(BaseComponent):
         Returns:
             the file id if the file is indexed, otherwise None
         """
+        file_name = file_path.name if isinstance(file_path, Path) else file_path
         if self.private:
             cond: tuple = (
-                self.Source.name == file_path.name,
+                self.Source.name == file_name,
                 self.Source.user == self.user_id,
             )
         else:
-            cond = (self.Source.name == file_path.name,)
+            cond = (self.Source.name == file_name,)

         with Session(engine) as session:
             stmt = select(self.Source).where(*cond)
@@ -469,6 +471,29 @@ class IndexPipeline(BaseComponent):

         return None

+    def store_url(self, url: str) -> str:
+        """Store URL into the database and storage, return the file id
+
+        Args:
+            url: the URL
+
+        Returns:
+            the file id
+        """
+        file_hash = sha256(url.encode()).hexdigest()
+        source = self.Source(
+            name=url,
+            path=file_hash,
+            size=0,
+            user=self.user_id,  # type: ignore
+        )
+        with Session(engine) as session:
+            session.add(source)
+            session.commit()
+            file_id = source.id
+
+        return file_id
+
     def store_file(self, file_path: Path) -> str:
         """Store file into the database and storage, return the file id

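Not part of the diff: a small sketch of the identity scheme store_url relies on. The URL string itself becomes Source.name, a sha256 digest of the URL stands in for the stored path, and the size is recorded as 0; the example URL is hypothetical.

# Sketch only (not in the commit): the stable identifier derived from a URL.
from hashlib import sha256

url = "https://example.com/page"
file_hash = sha256(url.encode()).hexdigest()
print(file_hash)  # hex digest stored in place of a file path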
@@ -495,7 +520,7 @@ class IndexPipeline(BaseComponent):

         return file_id

-    def finish(self, file_id: str, file_path: Path) -> str:
+    def finish(self, file_id: str, file_path: str | Path) -> str:
         """Finish the indexing"""
         with Session(engine) as session:
             stmt = select(self.Source).where(self.Source.id == file_id)
@@ -561,37 +586,55 @@ class IndexPipeline(BaseComponent):
     def stream(
         self, file_path: str | Path, reindex: bool, **kwargs
     ) -> Generator[Document, None, tuple[str, list[Document]]]:
-        # check for duplication
-        file_path = Path(file_path).resolve()
+        # check if the file is already indexed
+        if isinstance(file_path, Path):
+            file_path = file_path.resolve()

         file_id = self.get_id_if_exists(file_path)
-        if file_id is not None:
-            if not reindex:
-                raise ValueError(
-                    f"File {file_path.name} already indexed. Please rerun with "
-                    "reindex=True to force reindexing."
-                )
-            else:
-                # remove the existing records
-                yield Document(f" => Removing old {file_path.name}", channel="debug")
-                self.delete_file(file_id)
-                # add record to db
-                file_id = self.store_file(file_path)
-        else:
-            # add record to db
-            file_id = self.store_file(file_path)
+
+        if isinstance(file_path, Path):
+            if file_id is not None:
+                if not reindex:
+                    raise ValueError(
+                        f"File {file_path.name} already indexed. Please rerun with "
+                        "reindex=True to force reindexing."
+                    )
+                else:
+                    # remove the existing records
+                    yield Document(
+                        f" => Removing old {file_path.name}", channel="debug"
+                    )
+                    self.delete_file(file_id)
+                    file_id = self.store_file(file_path)
+            else:
+                # add record to db
+                file_id = self.store_file(file_path)
+        else:
+            if file_id is not None:
+                raise ValueError(f"URL {file_path} already indexed.")
+            else:
+                # add record to db
+                file_id = self.store_url(file_path)

         # extract the file
-        extra_info = default_file_metadata_func(str(file_path))
+        if isinstance(file_path, Path):
+            extra_info = default_file_metadata_func(str(file_path))
+            file_name = file_path.name
+        else:
+            extra_info = {"file_name": file_path}
+            file_name = file_path
+
         extra_info["file_id"] = file_id
         extra_info["collection_name"] = self.collection_name

-        yield Document(f" => Converting {file_path.name} to text", channel="debug")
+        yield Document(f" => Converting {file_name} to text", channel="debug")
         docs = self.loader.load_data(file_path, extra_info=extra_info)
-        yield Document(f" => Converted {file_path.name} to text", channel="debug")
-        yield from self.handle_docs(docs, file_id, file_path.name)
+        yield Document(f" => Converted {file_name} to text", channel="debug")
+        yield from self.handle_docs(docs, file_id, file_name)

         self.finish(file_id, file_path)

-        yield Document(f" => Finished indexing {file_path.name}", channel="debug")
+        yield Document(f" => Finished indexing {file_name}", channel="debug")
         return file_id, docs
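Not part of the diff: a condensed sketch of the metadata branch in stream() above. Local paths keep file-derived metadata and use the file name in progress messages, while URL strings get a minimal dict keyed by the URL itself; the helper name and sample inputs are hypothetical.

# Sketch only (not in the commit): Path vs. URL-string handling, mirroring stream().
from pathlib import Path

def derive_name_and_info(file_path):
    if isinstance(file_path, Path):
        # the real pipeline calls default_file_metadata_func(str(file_path)) here
        return file_path.name, {"file_name": file_path.name}
    return file_path, {"file_name": file_path}

print(derive_name_and_info(Path("report.pdf")))
print(derive_name_and_info("https://example.com"))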
@@ -658,20 +701,30 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
         )
         return obj

-    def route(self, file_path: Path) -> IndexPipeline:
+    def is_url(self, file_path: str | Path) -> bool:
+        return isinstance(file_path, str) and (
+            file_path.startswith("http://") or file_path.startswith("https://")
+        )
+
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Decide the pipeline based on the file type

         Can subclass this method for a more elaborate pipeline routing strategy.
         """
         _, chunk_size, chunk_overlap = dev_settings()

-        ext = file_path.suffix.lower()
-        reader = self.readers.get(ext, unstructured)
-        if reader is None:
-            raise NotImplementedError(
-                f"No supported pipeline to index {file_path.name}. Please specify "
-                "the suitable pipeline for this file type in the settings."
-            )
+        # check if file_path is a URL
+        if self.is_url(file_path):
+            reader = web_reader
+        else:
+            assert isinstance(file_path, Path)
+            ext = file_path.suffix.lower()
+            reader = self.readers.get(ext, unstructured)
+            if reader is None:
+                raise NotImplementedError(
+                    f"No supported pipeline to index {file_path.name}. Please specify "
+                    "the suitable pipeline for this file type in the settings."
+                )

         print("Using reader", reader)
         pipeline: IndexPipeline = IndexPipeline(
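Not part of the diff: a quick sketch of how the new is_url check discriminates the inputs route() now accepts; the sample inputs are hypothetical.

# Sketch only (not in the commit): same logic as IndexDocumentPipeline.is_url.
from pathlib import Path

def is_url(file_path) -> bool:
    return isinstance(file_path, str) and (
        file_path.startswith("http://") or file_path.startswith("https://")
    )

print(is_url("https://example.com"))  # True  -> handled by web_reader
print(is_url("report.pdf"))           # False -> plain string, not a URL
print(is_url(Path("report.pdf")))     # False -> routed by suffix via self.readers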
@@ -715,9 +768,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):

         n_files = len(file_paths)
         for idx, file_path in enumerate(file_paths):
-            file_path = Path(file_path)
+            if self.is_url(file_path):
+                file_name = file_path
+            else:
+                file_path = Path(file_path)
+                file_name = file_path.name
+
             yield Document(
-                content=f"Indexing [{idx + 1}/{n_files}]: {file_path.name}",
+                content=f"Indexing [{idx + 1}/{n_files}]: {file_name}",
                 channel="debug",
             )

@@ -730,7 +788,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 file_ids.append(file_id)
                 errors.append(None)
                 yield Document(
-                    content={"file_path": file_path, "status": "success"},
+                    content={
+                        "file_path": file_path,
+                        "file_name": file_name,
+                        "status": "success",
+                    },
                     channel="index",
                 )
             except Exception as e:
@@ -740,6 +802,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 yield Document(
                     content={
                         "file_path": file_path,
+                        "file_name": file_name,
                         "status": "failed",
                         "message": str(e),
                     },
@@ -111,18 +111,25 @@ class FileIndexPage(BasePage):
         """Build the UI of the app"""
         with gr.Row():
             with gr.Column(scale=1):
                 gr.Markdown("## File Upload")
                 with gr.Column() as self.upload:
-                    self.files = File(
-                        file_types=self._supported_file_types,
-                        file_count="multiple",
-                        container=True,
-                        show_label=False,
-                    )
+                    with gr.Tab("Upload Files"):
+                        self.files = File(
+                            file_types=self._supported_file_types,
+                            file_count="multiple",
+                            container=True,
+                            show_label=False,
+                        )

-                    msg = self.upload_instruction()
-                    if msg:
-                        gr.Markdown(msg)
+                        msg = self.upload_instruction()
+                        if msg:
+                            gr.Markdown(msg)
+
+                    with gr.Tab("Use Web Links"):
+                        self.urls = gr.Textbox(
+                            label="Input web URLs",
+                            lines=8,
+                        )
+                        gr.Markdown("(separated by new line)")

                 with gr.Accordion("Advanced indexing options", open=True):
                     with gr.Row():
@@ -525,6 +532,7 @@ class FileIndexPage(BasePage):
             fn=self.index_fn,
             inputs=[
                 self.files,
+                self.urls,
                 self.reindex,
                 self._app.settings_state,
                 self._app.user_id,
@@ -670,28 +678,33 @@ class FileIndexPage(BasePage):
         return remaining_files

     def index_fn(
-        self, files, reindex: bool, settings, user_id
+        self, files, urls, reindex: bool, settings, user_id
     ) -> Generator[tuple[str, str], None, None]:
         """Upload and index the files

         Args:
             files: the list of files to be uploaded
+            urls: list of web URLs to be indexed
             reindex: whether to reindex the files
             selected_files: the list of files already selected
             settings: the settings of the app
         """
-        if not files:
-            gr.Info("No uploaded file")
-            yield "", ""
-            return
+        if urls:
+            files = [it.strip() for it in urls.split("\n")]
+            errors = []
+        else:
+            if not files:
+                gr.Info("No uploaded file")
+                yield "", ""
+                return

-        files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)
+            files = self._may_extract_zip(files, flowsettings.KH_ZIP_INPUT_DIR)

-        errors = self.validate(files)
-        if errors:
-            gr.Warning(", ".join(errors))
-            yield "", ""
-            return
+            errors = self.validate(files)
+            if errors:
+                gr.Warning(", ".join(errors))
+                yield "", ""
+                return

         gr.Info(f"Start indexing {len(files)} files...")

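Not part of the diff: a sketch of how the multi-line "Use Web Links" textbox value becomes the files list that index_fn iterates over. The sample value is hypothetical and, as in the code above, blank lines are not filtered out.

# Sketch only (not in the commit): splitting the URL textbox exactly as index_fn does.
urls = "https://example.com\nhttps://example.org/docs"
files = [it.strip() for it in urls.split("\n")]
print(files)  # ['https://example.com', 'https://example.org/docs']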
@@ -708,10 +721,10 @@ class FileIndexPage(BasePage):
                     continue
                 if response.channel == "index":
                     if response.content["status"] == "success":
-                        outputs.append(f"\u2705 | {response.content['file_path'].name}")
+                        outputs.append(f"\u2705 | {response.content['file_name']}")
                     elif response.content["status"] == "failed":
                         outputs.append(
-                            f"\u274c | {response.content['file_path'].name}: "
+                            f"\u274c | {response.content['file_name']}: "
                             f"{response.content['message']}"
                         )
                 elif response.channel == "debug":
@@ -764,7 +777,7 @@ class FileIndexPage(BasePage):
         settings[f"index.options.{self._index.id}.reader_mode"] = "default"
         settings[f"index.options.{self._index.id}.quick_index_mode"] = True
         if to_process_files:
-            _iter = self.index_fn(to_process_files, reindex, settings, user_id)
+            _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
             try:
                 while next(_iter):
                     pass
@@ -844,7 +857,7 @@ class FileIndexPage(BasePage):
         for p in exclude_patterns:
             files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]

-        yield from self.index_fn(files, reindex, settings, user_id)
+        yield from self.index_fn(files, [], reindex, settings, user_id)

     def format_size_human_readable(self, num: float | str, suffix="B"):
         try: