Beta version for stackoverflow teams connections (#188)

Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
2025-12-05 19:40:07 +00:00 · 2023-04-13 02:42:01 -05:00 · 2023-04-13 02:42:01 -05:00 · f64be68bcc
commit f64be68bcc
parent 9a98b46413
5 changed files with 284 additions and 0 deletions
--- a/loader_hub/library.json
+++ b/loader_hub/library.json
@ -376,6 +376,102 @@
    "id": "airtable",
    "author": "smyja"
  },
+  "HatenaBlogReader": {
+    "id": "hatena_blog",
+    "author": "Shoya SHIRAKI",
+    "keywords": [
+      "hatena",
+      "blog"
+    ]
+  },
+  "OpendalReader": {
+    "id": "opendal_reader",
+    "author": "OpenDAL Contributors",
+    "keywords": [
+      "storage"
+    ]
+  },
+  "OpendalS3Reader": {
+    "id": "opendal_reader/s3",
+    "author": "OpenDAL Contributors",
+    "keywords": [
+      "storage",
+      "s3"
+    ]
+  },
+  "OpendalAzblobReader": {
+    "id": "opendal_reader/azblob",
+    "author": "OpenDAL Contributors",
+    "keywords": [
+      "storage",
+      "azblob"
+    ]
+  },
+  "OpendalGcsReader": {
+    "id": "opendal_reader/gcs",
+    "author": "OpenDAL Contributors",
+    "keywords": [
+      "storage",
+      "gcs"
+    ]
+  },
+  "ConfluenceReader": {
+    "id": "confluence",
+    "author": "zywilliamli"
+  },
+  "ChatGPTRetrievalPluginReader": {
+    "id": "chatgpt_plugin",
+    "author": "jerryjliu"
+  },
+  "JiraReader": {
+    "id": "jira",
+    "author": "bearguy",
+    "keywords": [
+      "jira"
+    ]
+  },
+  "UnstructuredURLLoader": {
+    "id": "web/unstructured_web",
+    "author": "kravetsmic",
+    "keywords": [
+      "unstructured.io",
+      "url"
+    ]
+  },
+  "GoogleSheetsReader": {
+    "id": "google_sheets",
+    "author": "piroz"
+  },
+  "FeedlyRssReader": {
+    "id": "feedly_rss",
+    "author": "kychanbp",
+    "keywords": [
+      "feedly",
+      "rss"
+    ]
+  },
+  "FlatPdfReader": {
+    "id": "file/flat_pdf",
+    "author": "emmanuel-oliveira",
+    "keywords": [
+      "pdf",
+      "flat",
+      "flattened"
+    ]
+  },
+  "MilvusReader": {
+    "id": "milvus",
+    "author": "filip-halt"
+  },
+  "StackoverflowReader": {
+    "id": "stackoverflow",
+    "author": "allen-munsch",
+    "keywords": [
+      "posts",
+      "questions",
+      "answers"
+    ]
+  },
  "ZulipReader": {
    "id": "zulip",
    "author": "plurigrid"
--- a/loader_hub/stackoverflow/README.md
+++ b/loader_hub/stackoverflow/README.md
@ -0,0 +1,32 @@
+# StackoverflowReader (In Beta)
+
+This loader uses the Stack Overflow for Teams API to read your team's posts and return the questions and answers they contain as a list of documents.
+
+It also supports caching the results to a local directory, so that you can run the load_data() method multiple times without hitting the API.
+
+## Getting a token
+
+Visit: https://stackoverflowteams.com/users/pats/
+
+1. Click Create a new PAT
+2. Name the token, and pick the team scope
+3. Select an expiration date
+4. Click Create
+
+Export the token and your team name as environment variables (see below), or pass them directly when instantiating the reader: `StackoverflowReader(api_key, team_name, cache_dir='./stackoverflow_cache')`
+
+```bash
+export STACKOVERFLOW_PAT=your_token
+export STACKOVERFLOW_TEAM_NAME=your_team
+```
+
+
+
+Other features which could be added:
+
+ - Add articles
+ - Add comments
+ - Add tags
+ - Add users
+ - Add votes
+ - Add badges
--- a/loader_hub/stackoverflow/__init__.py
+++ b/loader_hub/stackoverflow/__init__.py
@ -0,0 +1 @@
+"""Init file."""
--- a/loader_hub/stackoverflow/base.py
+++ b/loader_hub/stackoverflow/base.py
@ -0,0 +1,153 @@
+import json
+import logging
+import os
+import threading
+import time
+from dataclasses import dataclass
+from datetime import datetime
+from functools import wraps
+from typing import List, Optional
+
+import requests
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class StackOverflowPost:
+    """A single item returned by the Stack Overflow for Teams API.
+
+    Instances are built by unpacking the item's JSON object as keyword
+    arguments, so field names must match the API's keys. The nested ``owner``
+    object is expected to arrive flattened, with every key prefixed by
+    ``owner_``.
+    """
+
+    # Fields without a default must be present on every item.
+    link: str
+    score: int
+    # NOTE(review): the *_date fields look like Unix epoch seconds (the reader
+    # passes creation_date to datetime.fromtimestamp) - confirm against the API.
+    last_activity_date: int
+    creation_date: int
+    post_id: Optional[int] = None
+    post_type: Optional[str] = None
+    body_markdown: Optional[str] = None
+    # Flattened ``owner`` object.
+    owner_account_id: Optional[int] = None
+    owner_reputation: Optional[int] = None
+    owner_user_id: Optional[int] = None
+    owner_user_type: Optional[str] = None
+    owner_profile_image: Optional[str] = None
+    owner_display_name: Optional[str] = None
+    owner_link: Optional[str] = None
+    title:  Optional[str] = None
+    # Annotated as int to agree with the other *_date fields.
+    last_edit_date:  Optional[int] = None
+    tags: Optional[List[str]] = None
+    view_count: Optional[int] = None
+    # Presumably only set when the item is an article - TODO confirm.
+    article_id: Optional[int] = None
+    article_type: Optional[str] = None
+
+def rate_limit(*, allowed_per_second: int):
+    max_period = 1.0 / allowed_per_second
+    last_call = [time.perf_counter()]
+    lock = threading.Lock()
+
+    def decorate(func):
+        @wraps(func)
+        def limit(*args, **kwargs):
+            with lock:
+                elapsed = time.perf_counter() - last_call[0]
+                hold = max_period - elapsed
+                if hold > 0:
+                    time.sleep(hold)
+                result = func(*args, **kwargs)
+                last_call[0] = time.perf_counter()
+            return result
+        return limit
+    return decorate
+
+@rate_limit(allowed_per_second=15)
+def rate_limited_get(url, headers):
+    '''
+    https://api.stackoverflowteams.com/docs/throttle
+    https://api.stackexchange.com/docs/throttle
+    Every application is subject to an IP based concurrent request throttle.
+    If a single IP is making more than 30 requests a second, new requests will be dropped.
+    The exact ban period is subject to change, but will be on the order of 30 seconds to a few minutes typically.
+    Note that exactly what response an application gets (in terms of HTTP code, text, and so on)
+    is undefined when subject to this ban; we consider > 30 request/sec per IP to be very abusive and thus cut the requests off very harshly.
+    '''
+    resp = requests.get(url, headers=headers)
+    if resp.status_code == 429:
+        logger.warning('Rate limited, sleeping for 5 minutes')
+        time.sleep(300)
+        return rate_limited_get(url, headers)
+    return resp
+
+
+class StackoverflowReader(BaseReader):
+
+    def __init__(self, api_key: str = None, team_name: str = None, cache_dir: str = None) -> None:
+        self._api_key = api_key or os.environ.get('STACKOVERFLOW_PAT')
+        self._team_name = team_name or os.environ.get('STACKOVERFLOW_TEAM_NAME')
+        self._last_index_time = None # TODO
+        self._cache_dir = cache_dir
+        if self._cache_dir:
+            os.makedirs(self._cache_dir, exist_ok=True)
+
+    def load_data(self, page: int = 1, doc_type: str = 'posts', limit: int = 50) -> List[Document]:
+        data = []
+        has_more = True
+
+        while has_more:
+            url = self.build_url(page, doc_type)
+            headers = {'X-API-Access-Token': self._api_key}
+            fp = os.path.join(self._cache_dir, f'{doc_type}_{page}.json')
+            response = {}
+            if self._cache_dir and os.path.exists(fp) and os.path.getsize(fp) > 0:
+                try:
+                    with open(fp, 'r') as f:
+                        response = f.read()
+                        response = json.loads(response)
+                except Exception as e:
+                    logger.error(e)
+            if not response:
+                response = rate_limited_get(url, headers)
+                response.raise_for_status()
+                if self._cache_dir:
+                    with open(os.path.join(self._cache_dir, f'{doc_type}_{page}.json'), 'w') as f:
+                        f.write(response.content.decode('utf-8'))
+                    logger.info(f'Wrote {fp} to cache')
+                response = response.json()
+            has_more = response['has_more']
+            items = response['items']
+            logger.info(f'Fetched {len(items)} {doc_type} from Stack Overflow')
+
+            for item_dict in items:
+                owner_fields = {}
+                if 'owner' in item_dict:
+                    owner_fields = {f"owner_{k}": v for k, v in item_dict.pop('owner').items()}
+                if 'title' not in item_dict:
+                    item_dict['title'] = item_dict['link']
+                post = StackOverflowPost(**item_dict, **owner_fields)
+                # TODO: filter out old posts
+                # last_modified = datetime.fromtimestamp(post.last_edit_date or post.last_activity_date)
+                # if last_modified < self._last_index_time:
+                #     return data
+
+                post_document = Document(text=post.body_markdown, doc_id=post.post_id,
+                                         extra_info={"title": post.title, "author": post.owner_display_name,
+                                                     "timestamp": datetime.fromtimestamp(post.creation_date), "location": post.link,
+                                                     "url": post.link, "author_image_url": post.owner_profile_image,
+                                                     "type": post.post_type})
+                data.append(post_document)
+
+            if has_more:
+                page += 1
+
+        return data
+
+    def build_url(self, page: int, doc_type: str) -> str:
+        team_fragment = f'&team={self._team_name}'
+        # not sure if this filter is shared globally, or only to a particular team
+        filter_fragment = '&filter=!nOedRLbqzB'
+        page_fragment = f'&page={page}'
+        url = f'https://api.stackoverflowteams.com/2.3/{doc_type}?{team_fragment}{filter_fragment}{page_fragment}'
+        return url
+
+
+
+if __name__ == "__main__":
+    reader = StackoverflowReader(os.environ.get('STACKOVERFLOW_PAT'), os.environ.get('STACKOVERFLOW_TEAM_NAME'), cache_dir='./stackoverflow_cache')
+    # reader.load_data()
--- a/loader_hub/stackoverflow/requirements.txt
+++ b/loader_hub/stackoverflow/requirements.txt
@ -0,0 +1,2 @@
+requests
+llama_index