diff --git a/loader_hub/library.json b/loader_hub/library.json index d516cce3..a94d37f6 100644 --- a/loader_hub/library.json +++ b/loader_hub/library.json @@ -376,6 +376,102 @@ "id": "airtable", "author": "smyja" }, + "HatenaBlogReader": { + "id": "hatena_blog", + "author": "Shoya SHIRAKI", + "keywords": [ + "hatena", + "blog" + ] + }, + "OpendalReader": { + "id": "opendal_reader", + "author": "OpenDAL Contributors", + "keywords": [ + "storage" + ] + }, + "OpendalS3Reader": { + "id": "opendal_reader/s3", + "author": "OpenDAL Contributors", + "keywords": [ + "storage", + "s3" + ] + }, + "OpendalAzblobReader": { + "id": "opendal_reader/azblob", + "author": "OpenDAL Contributors", + "keywords": [ + "storage", + "azblob" + ] + }, + "OpendalGcsReader": { + "id": "opendal_reader/gcs", + "author": "OpenDAL Contributors", + "keywords": [ + "storage", + "gcs" + ] + }, + "ConfluenceReader": { + "id": "confluence", + "author": "zywilliamli" + }, + "ChatGPTRetrievalPluginReader": { + "id": "chatgpt_plugin", + "author": "jerryjliu" + }, + "JiraReader": { + "id": "jira", + "author": "bearguy", + "keywords": [ + "jira" + ] + }, + "UnstructuredURLLoader": { + "id": "web/unstructured_web", + "author": "kravetsmic", + "keywords": [ + "unstructured.io", + "url" + ] + }, + "GoogleSheetsReader": { + "id": "google_sheets", + "author": "piroz" + }, + "FeedlyRssReader": { + "id": "feedly_rss", + "author": "kychanbp", + "keywords": [ + "feedly", + "rss" + ] + }, + "FlatPdfReader": { + "id": "file/flat_pdf", + "author": "emmanuel-oliveira", + "keywords": [ + "pdf", + "flat", + "flattened" + ] + }, + "MilvusReader": { + "id": "milvus", + "author": "filip-halt" + }, + "StackoverflowReader": { + "id": "stackoverflow", + "author": "allen-munsch", + "keywords": [ + "posts", + "questions", + "answers" + ] + }, "ZulipReader": { "id": "zulip", "author": "plurigrid" diff --git a/loader_hub/stackoverflow/README.md b/loader_hub/stackoverflow/README.md new file mode 100644 index 00000000..3078a45f --- 
@dataclass
class StackOverflowPost:
    """Flat record for one item of a Stack Overflow for Teams API response.

    The reader builds instances by splatting an API item dict into the
    constructor; keys of the item's nested ``owner`` object arrive here as
    ``owner_<key>``.
    """

    # Fields without defaults: constructing a post without them raises TypeError.
    link: str
    score: int
    last_activity_date: int
    creation_date: int  # handed to datetime.fromtimestamp() by the reader
    post_id: Optional[int] = None
    post_type: Optional[str] = None
    body_markdown: Optional[str] = None
    owner_account_id: Optional[int] = None
    owner_reputation: Optional[int] = None
    owner_user_id: Optional[int] = None
    owner_user_type: Optional[str] = None
    owner_profile_image: Optional[str] = None
    owner_display_name: Optional[str] = None
    owner_link: Optional[str] = None
    title: Optional[str] = None
    # NOTE(review): annotated as int to agree with the other *_date fields;
    # presumably epoch seconds like creation_date -- confirm against the API.
    last_edit_date: Optional[int] = None
    tags: Optional[List[str]] = None
    view_count: Optional[int] = None
    article_id: Optional[int] = None
    article_type: Optional[str] = None


def rate_limit(*, allowed_per_second: int):
    """Build a decorator that spaces out the start of successive calls.

    All functions decorated by one ``rate_limit(...)`` result share a single
    schedule: each call reserves the next free start time, at least
    ``1 / allowed_per_second`` seconds after the previously reserved one, and
    sleeps until that time before invoking the wrapped function.

    Args:
        allowed_per_second: Maximum number of calls allowed to start per
            second. Must be greater than zero.

    Returns:
        A decorator; the decorated function keeps its name and docstring and
        returns whatever the wrapped function returns.

    Raises:
        ValueError: If ``allowed_per_second`` is not positive.
    """
    if allowed_per_second <= 0:
        raise ValueError('allowed_per_second must be greater than zero')

    min_interval = 1.0 / allowed_per_second
    # One-element list so the nested function can rebind the value.
    next_slot = [time.perf_counter()]
    lock = threading.Lock()

    def decorate(func):
        @wraps(func)
        def limit(*args, **kwargs):
            # The lock only guards the bookkeeping. It is released before
            # sleeping and before calling func, so a wrapped function that
            # re-enters this wrapper (for example by retrying itself) cannot
            # block forever on the non-reentrant lock, and a slow call does
            # not stall other threads.
            with lock:
                start_at = max(time.perf_counter(), next_slot[0])
                next_slot[0] = start_at + min_interval
            delay = start_at - time.perf_counter()
            if delay > 0:
                time.sleep(delay)
            return func(*args, **kwargs)
        return limit
    return decorate
@rate_limit(allowed_per_second=15)
def rate_limited_get(url, headers, *, max_retries: int = 3, retry_wait: float = 300, timeout: float = 60):
    """GET ``url``, waiting out HTTP 429 responses before giving up.

    Throttle rules, quoted from
    https://api.stackoverflowteams.com/docs/throttle and
    https://api.stackexchange.com/docs/throttle:

        Every application is subject to an IP based concurrent request throttle.
        If a single IP is making more than 30 requests a second, new requests will be dropped.
        The exact ban period is subject to change, but will be on the order of 30 seconds to a few minutes typically.
        Note that exactly what response an application gets (in terms of HTTP code, text, and so on)
        is undefined when subject to this ban; we consider > 30 request/sec per IP to be very abusive
        and thus cut the requests off very harshly.

    Args:
        url: Fully built request URL.
        headers: HTTP headers sent with the request.
        max_retries: How many times a 429 response is retried.
        retry_wait: Seconds to sleep before each retry.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The last ``requests.Response`` received. It can still carry status
        429 once the retries are used up; callers that invoke
        ``raise_for_status()`` will then see the error.
    """
    attempt = 0
    # Retry in a loop instead of calling rate_limited_get again: a nested call
    # would go back through the rate_limit wrapper, which can block on a lock
    # that the outer call may still be holding.
    while True:
        resp = requests.get(url, headers=headers, timeout=timeout)
        if resp.status_code != 429 or attempt >= max_retries:
            return resp
        attempt += 1
        logger.warning('Rate limited, sleeping for %s seconds (retry %s of %s)',
                       retry_wait, attempt, max_retries)
        time.sleep(retry_wait)


class StackoverflowReader(BaseReader):
    """Load Stack Overflow for Teams items as ``Document`` objects.

    Pages are requested from ``https://api.stackoverflowteams.com/2.3/`` and,
    when ``cache_dir`` is set, each raw page is stored there as
    ``<doc_type>_<page>.json`` and reused on later runs.
    """

    def __init__(self, api_key: Optional[str] = None, team_name: Optional[str] = None,
                 cache_dir: Optional[str] = None) -> None:
        """Store credentials and prepare the optional cache directory.

        Args:
            api_key: Access token; falls back to the ``STACKOVERFLOW_PAT``
                environment variable.
            team_name: Team identifier; falls back to the
                ``STACKOVERFLOW_TEAM_NAME`` environment variable.
            cache_dir: Directory for cached pages. Created if missing.
                Caching is disabled when this is empty or ``None``.
        """
        self._api_key = api_key or os.environ.get('STACKOVERFLOW_PAT')
        self._team_name = team_name or os.environ.get('STACKOVERFLOW_TEAM_NAME')
        self._last_index_time = None  # TODO
        self._cache_dir = cache_dir
        if self._cache_dir:
            os.makedirs(self._cache_dir, exist_ok=True)

    def load_data(self, page: int = 1, doc_type: str = 'posts', limit: int = 50) -> List[Document]:
        """Fetch every page from ``page`` onward and convert items to documents.

        Args:
            page: First page number to load.
            doc_type: API collection name placed in the URL path and in the
                cache file name.
            limit: Accepted for interface compatibility; this method does not
                currently use it.

        Returns:
            One ``Document`` per item, in the order the pages returned them.

        Raises:
            ValueError: If a page must be fetched and no team name is set.
            requests.HTTPError: If the API answers with an error status.
        """
        from dataclasses import fields

        # API items may carry keys the dataclass does not declare; passing
        # them to the constructor would raise TypeError, so they are dropped.
        known_fields = {f.name for f in fields(StackOverflowPost)}
        headers = {'X-API-Access-Token': self._api_key}
        data = []
        has_more = True

        while has_more:
            # Only build a cache path when caching is enabled; joining onto a
            # None directory would raise TypeError.
            cache_path = None
            if self._cache_dir:
                cache_path = os.path.join(self._cache_dir, f'{doc_type}_{page}.json')

            payload = self._read_cached_page(cache_path)
            if not payload:
                payload = self._fetch_page(page, doc_type, headers, cache_path)

            has_more = bool(payload.get('has_more', False))
            items = payload.get('items', [])
            logger.info(f'Fetched {len(items)} {doc_type} from Stack Overflow')

            for item_dict in items:
                owner = item_dict.get('owner') or {}
                flat = {k: v for k, v in item_dict.items() if k in known_fields}
                flat.update({f'owner_{k}': v for k, v in owner.items()
                             if f'owner_{k}' in known_fields})
                # Items without a title are labelled with their link.
                if 'title' not in flat:
                    flat['title'] = item_dict['link']
                post = StackOverflowPost(**flat)
                # TODO: filter out old posts
                # last_modified = datetime.fromtimestamp(post.last_edit_date or post.last_activity_date)
                # if last_modified < self._last_index_time:
                #     return data

                post_document = Document(text=post.body_markdown, doc_id=post.post_id,
                                         extra_info={"title": post.title, "author": post.owner_display_name,
                                                     "timestamp": datetime.fromtimestamp(post.creation_date),
                                                     "location": post.link,
                                                     "url": post.link, "author_image_url": post.owner_profile_image,
                                                     "type": post.post_type})
                data.append(post_document)

            if has_more:
                page += 1

        return data

    def _read_cached_page(self, cache_path):
        """Return the JSON object cached at ``cache_path``, or ``{}`` if unusable."""
        if not cache_path:
            return {}
        try:
            if os.path.getsize(cache_path) == 0:
                return {}
            with open(cache_path, 'r', encoding='utf-8') as f:
                cached = json.load(f)
        except FileNotFoundError:
            return {}
        except (OSError, ValueError) as e:
            # Unreadable or corrupt cache entry: log it and fetch the page again.
            logger.error(e)
            return {}
        return cached if isinstance(cached, dict) else {}

    def _fetch_page(self, page, doc_type, headers, cache_path):
        """Download one page, store it at ``cache_path`` if given, return parsed JSON."""
        response = rate_limited_get(self.build_url(page, doc_type), headers)
        response.raise_for_status()
        if cache_path:
            # Explicit UTF-8 so non-ASCII content round-trips regardless of
            # the platform's default encoding.
            with open(cache_path, 'w', encoding='utf-8') as f:
                f.write(response.content.decode('utf-8'))
            logger.info(f'Wrote {cache_path} to cache')
        return response.json()

    def build_url(self, page: int, doc_type: str) -> str:
        """Return the API URL for ``page`` of ``doc_type`` for the configured team.

        Raises:
            ValueError: If no team name was given and
                ``STACKOVERFLOW_TEAM_NAME`` is unset.
        """
        from urllib.parse import quote

        if not self._team_name:
            raise ValueError(
                'team_name is required: pass it to StackoverflowReader or set STACKOVERFLOW_TEAM_NAME'
            )
        team_value = quote(str(self._team_name), safe='')
        team_fragment = f'team={team_value}'
        # not sure if this filter is shared globally, or only to a particular team
        filter_fragment = 'filter=!nOedRLbqzB'
        page_fragment = f'page={page}'
        url = f'https://api.stackoverflowteams.com/2.3/{doc_type}?{team_fragment}&{filter_fragment}&{page_fragment}'
        return url


if __name__ == "__main__":
    reader = StackoverflowReader(os.environ.get('STACKOVERFLOW_PAT'), os.environ.get('STACKOVERFLOW_TEAM_NAME'), cache_dir='./stackoverflow_cache')
    # reader.load_data()