mirror of
https://github.com/run-llama/llama-hub.git
synced 2025-12-05 19:40:07 +00:00
Beta version for stackoverflow teams connections (#188)
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
This commit is contained in:
parent
9a98b46413
commit
f64be68bcc
@ -376,6 +376,102 @@
|
||||
"id": "airtable",
|
||||
"author": "smyja"
|
||||
},
|
||||
"HatenaBlogReader": {
|
||||
"id": "hatena_blog",
|
||||
"author": "Shoya SHIRAKI",
|
||||
"keywords": [
|
||||
"hatena",
|
||||
"blog"
|
||||
]
|
||||
},
|
||||
"OpendalReader": {
|
||||
"id": "opendal_reader",
|
||||
"author": "OpenDAL Contributors",
|
||||
"keywords": [
|
||||
"storage"
|
||||
]
|
||||
},
|
||||
"OpendalS3Reader": {
|
||||
"id": "opendal_reader/s3",
|
||||
"author": "OpenDAL Contributors",
|
||||
"keywords": [
|
||||
"storage",
|
||||
"s3"
|
||||
]
|
||||
},
|
||||
"OpendalAzblobReader": {
|
||||
"id": "opendal_reader/azblob",
|
||||
"author": "OpenDAL Contributors",
|
||||
"keywords": [
|
||||
"storage",
|
||||
"azblob"
|
||||
]
|
||||
},
|
||||
"OpendalGcsReader": {
|
||||
"id": "opendal_reader/gcs",
|
||||
"author": "OpenDAL Contributors",
|
||||
"keywords": [
|
||||
"storage",
|
||||
"gcs"
|
||||
]
|
||||
},
|
||||
"ConfluenceReader": {
|
||||
"id": "confluence",
|
||||
"author": "zywilliamli"
|
||||
},
|
||||
"ChatGPTRetrievalPluginReader": {
|
||||
"id": "chatgpt_plugin",
|
||||
"author": "jerryjliu"
|
||||
},
|
||||
"JiraReader": {
|
||||
"id": "jira",
|
||||
"author": "bearguy",
|
||||
"keywords": [
|
||||
"jira"
|
||||
]
|
||||
},
|
||||
"UnstructuredURLLoader": {
|
||||
"id": "web/unstructured_web",
|
||||
"author": "kravetsmic",
|
||||
"keywords": [
|
||||
"unstructured.io",
|
||||
"url"
|
||||
]
|
||||
},
|
||||
"GoogleSheetsReader": {
|
||||
"id": "google_sheets",
|
||||
"author": "piroz"
|
||||
},
|
||||
"FeedlyRssReader": {
|
||||
"id": "feedly_rss",
|
||||
"author": "kychanbp",
|
||||
"keywords": [
|
||||
"feedly",
|
||||
"rss"
|
||||
]
|
||||
},
|
||||
"FlatPdfReader": {
|
||||
"id": "file/flat_pdf",
|
||||
"author": "emmanuel-oliveira",
|
||||
"keywords": [
|
||||
"pdf",
|
||||
"flat",
|
||||
"flattened"
|
||||
]
|
||||
},
|
||||
"MilvusReader": {
|
||||
"id": "milvus",
|
||||
"author": "filip-halt"
|
||||
},
|
||||
"StackoverflowReader": {
|
||||
"id": "stackoverflow",
|
||||
"author": "allen-munsch",
|
||||
"keywords": [
|
||||
"posts",
|
||||
"questions",
|
||||
"answers"
|
||||
]
|
||||
},
|
||||
"ZulipReader": {
|
||||
"id": "zulip",
|
||||
"author": "plurigrid"
|
||||
|
||||
32
loader_hub/stackoverflow/README.md
Normal file
32
loader_hub/stackoverflow/README.md
Normal file
@ -0,0 +1,32 @@
|
||||
# StackoverflowReader (In Beta)
|
||||
|
||||
Using the Stack Overflow API, this class reads the Stack Overflow Teams API and returns a list of question-and-answer documents built from posts.
|
||||
|
||||
It also supports caching the results to a local directory, so that you can run the load_data() method multiple times without hitting the API.
|
||||
|
||||
## Getting a token
|
||||
|
||||
Visit: https://stackoverflowteams.com/users/pats/
|
||||
|
||||
1. Click Create a new PAT
2. Name the token, and pick the team scope
3. Select an expiration date
4. Click Create
|
||||
|
||||
Add these to your env, or pass them when instantiating the reader: `StackoverflowReader(api_key, team_name, cache_dir='./stackoverflow_cache')`
|
||||
|
||||
```bash
|
||||
export STACKOVERFLOW_PAT=your_token
|
||||
export STACKOVERFLOW_TEAM_NAME=your_team
|
||||
```
|
||||
|
||||
|
||||
|
||||
Other features which could be added:
|
||||
|
||||
- Add articles
|
||||
- Add comments
|
||||
- Add tags
|
||||
- Add users
|
||||
- Add votes
|
||||
- Add badges
|
||||
1
loader_hub/stackoverflow/__init__.py
Normal file
1
loader_hub/stackoverflow/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
"""Init file."""
|
||||
153
loader_hub/stackoverflow/base.py
Normal file
153
loader_hub/stackoverflow/base.py
Normal file
@ -0,0 +1,153 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import threading
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from functools import wraps
|
||||
from typing import List, Optional
|
||||
|
||||
import requests
|
||||
from llama_index.readers.base import BaseReader
|
||||
from llama_index.readers.schema.base import Document
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class StackOverflowPost:
    """Flattened representation of one item returned by the Stack Overflow
    (Teams) API.

    Field names mirror the JSON keys of the API response; the ``owner_*``
    fields are the keys of the nested ``owner`` object, flattened with an
    ``owner_`` prefix by the caller (see ``StackoverflowReader.load_data``).
    """

    # Fields without defaults are present on every item the reader consumes.
    link: str
    score: int
    last_activity_date: int  # epoch seconds (passed to datetime.fromtimestamp)
    creation_date: int  # epoch seconds (passed to datetime.fromtimestamp)
    # Optional fields: presence depends on the item type (post vs. article)
    # and on the API filter used in build_url.
    post_id: Optional[int] = None
    post_type: Optional[str] = None
    body_markdown: Optional[str] = None
    owner_account_id: Optional[int] = None
    owner_reputation: Optional[int] = None
    owner_user_id: Optional[int] = None
    owner_user_type: Optional[str] = None
    owner_profile_image: Optional[str] = None
    owner_display_name: Optional[str] = None
    owner_link: Optional[str] = None
    title: Optional[str] = None
    last_edit_date: Optional[str] = None
    tags: Optional[List[str]] = None
    view_count: Optional[int] = None
    article_id: Optional[int] = None
    article_type: Optional[str] = None
|
||||
|
||||
def rate_limit(*, allowed_per_second: int):
    """Decorator factory that throttles calls to *allowed_per_second*.

    A lock serializes the timing bookkeeping, so concurrent callers are
    spaced out rather than racing; each call sleeps just long enough to
    keep at least ``1 / allowed_per_second`` seconds between invocations.
    """
    min_interval = 1.0 / allowed_per_second
    guard = threading.Lock()
    prev_call = time.perf_counter()

    def decorate(func):
        @wraps(func)
        def limit(*args, **kwargs):
            nonlocal prev_call
            with guard:
                wait = min_interval - (time.perf_counter() - prev_call)
                if wait > 0:
                    time.sleep(wait)
                value = func(*args, **kwargs)
                prev_call = time.perf_counter()
            return value

        return limit

    return decorate
|
||||
|
||||
@rate_limit(allowed_per_second=15)
def rate_limited_get(url, headers):
    """Throttled HTTP GET with automatic back-off on HTTP 429.

    See:
      https://api.stackoverflowteams.com/docs/throttle
      https://api.stackexchange.com/docs/throttle

    Every application is subject to an IP based concurrent request throttle.
    If a single IP is making more than 30 requests a second, new requests
    will be dropped. The exact ban period is subject to change, but will be
    on the order of 30 seconds to a few minutes typically. Note that exactly
    what response an application gets (in terms of HTTP code, text, and so
    on) is undefined when subject to this ban; we consider > 30 request/sec
    per IP to be very abusive and thus cut the requests off very harshly.

    Args:
        url: Fully-built API URL.
        headers: Headers dict (carries the X-API-Access-Token).

    Returns:
        The first ``requests.Response`` whose status is not 429.
    """
    # BUGFIX: retry in a loop instead of recursing. The previous recursive
    # retry grew the call stack on every 429 and re-entered the @rate_limit
    # wrapper (whose threading.Lock is non-reentrant, so if the limiter
    # calls through while holding the lock the retry would deadlock).
    while True:
        resp = requests.get(url, headers=headers)
        if resp.status_code != 429:
            return resp
        logger.warning('Rate limited, sleeping for 5 minutes')
        time.sleep(300)
|
||||
|
||||
|
||||
class StackoverflowReader(BaseReader):
    """Stack Overflow for Teams reader.

    Fetches items (e.g. posts — questions and answers) from the Stack
    Overflow Teams API and converts them into ``Document`` objects.
    Responses can optionally be cached on disk so repeated ``load_data()``
    calls do not hit the API.
    """

    def __init__(
        self,
        api_key: Optional[str] = None,
        team_name: Optional[str] = None,
        cache_dir: Optional[str] = None,
    ) -> None:
        """
        Args:
            api_key: Personal access token; falls back to the
                ``STACKOVERFLOW_PAT`` environment variable.
            team_name: Team slug; falls back to the
                ``STACKOVERFLOW_TEAM_NAME`` environment variable.
            cache_dir: If given, each API page is cached here as a JSON file.
        """
        self._api_key = api_key or os.environ.get('STACKOVERFLOW_PAT')
        self._team_name = team_name or os.environ.get('STACKOVERFLOW_TEAM_NAME')
        self._last_index_time = None  # TODO: support incremental indexing
        self._cache_dir = cache_dir
        if self._cache_dir:
            os.makedirs(self._cache_dir, exist_ok=True)

    def load_data(self, page: int = 1, doc_type: str = 'posts', limit: int = 50) -> List[Document]:
        """Fetch every page of ``doc_type`` starting at ``page``.

        Args:
            page: 1-based page number to start from.
            doc_type: API collection to read (e.g. ``'posts'``).
            limit: Currently unused; kept for interface stability.
                TODO: honor it as a maximum number of documents.

        Returns:
            One ``Document`` per item, carrying title/author/timestamp/url
            metadata in ``extra_info``.

        Raises:
            requests.HTTPError: If the API responds with an error status.
        """
        data: List[Document] = []
        has_more = True

        while has_more:
            url = self.build_url(page, doc_type)
            headers = {'X-API-Access-Token': self._api_key}
            # BUGFIX: only build a cache path when a cache dir is configured;
            # os.path.join(None, ...) raised TypeError with the default
            # cache_dir=None.
            fp = (
                os.path.join(self._cache_dir, f'{doc_type}_{page}.json')
                if self._cache_dir
                else None
            )

            response = {}
            if fp and os.path.exists(fp) and os.path.getsize(fp) > 0:
                # Best-effort cache read: on any failure fall through to the
                # API rather than aborting the load.
                try:
                    with open(fp, 'r') as f:
                        response = json.loads(f.read())
                except Exception as e:
                    logger.error(e)
            if not response:
                http_resp = rate_limited_get(url, headers)
                http_resp.raise_for_status()
                if fp:
                    # Persist the raw page so later runs can skip the API.
                    with open(fp, 'w') as f:
                        f.write(http_resp.content.decode('utf-8'))
                    logger.info(f'Wrote {fp} to cache')
                response = http_resp.json()

            has_more = response['has_more']
            items = response['items']
            logger.info(f'Fetched {len(items)} {doc_type} from Stack Overflow')

            for item_dict in items:
                # Flatten the nested 'owner' object into owner_* keys so the
                # payload maps onto StackOverflowPost's flat schema.
                owner_fields = {}
                if 'owner' in item_dict:
                    owner_fields = {
                        f"owner_{k}": v for k, v in item_dict.pop('owner').items()
                    }
                if 'title' not in item_dict:
                    # Some item types (e.g. answers) carry no title; use the
                    # link as a stand-in.
                    item_dict['title'] = item_dict['link']
                post = StackOverflowPost(**item_dict, **owner_fields)

                # TODO: filter out items older than self._last_index_time
                # once incremental indexing is supported.

                post_document = Document(
                    text=post.body_markdown,
                    doc_id=post.post_id,
                    extra_info={
                        "title": post.title,
                        "author": post.owner_display_name,
                        "timestamp": datetime.fromtimestamp(post.creation_date),
                        "location": post.link,
                        "url": post.link,
                        "author_image_url": post.owner_profile_image,
                        "type": post.post_type,
                    },
                )
                data.append(post_document)

            if has_more:
                page += 1

        return data

    def build_url(self, page: int, doc_type: str) -> str:
        """Build the Teams API URL for ``doc_type`` at ``page``.

        Note: the query string deliberately reproduces the original
        '?&team=...' shape (each fragment carries its own '&'); the API
        tolerates the empty leading parameter.
        """
        team_fragment = f'&team={self._team_name}'
        # NOTE(review): unclear whether this filter id is shared globally or
        # is team-specific -- confirm before reusing elsewhere.
        filter_fragment = '&filter=!nOedRLbqzB'
        page_fragment = f'&page={page}'
        url = f'https://api.stackoverflowteams.com/2.3/{doc_type}?{team_fragment}{filter_fragment}{page_fragment}'
        return url
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: expects STACKOVERFLOW_PAT and
    # STACKOVERFLOW_TEAM_NAME to be set in the environment.
    reader = StackoverflowReader(
        api_key=os.environ.get('STACKOVERFLOW_PAT'),
        team_name=os.environ.get('STACKOVERFLOW_TEAM_NAME'),
        cache_dir='./stackoverflow_cache',
    )
    # reader.load_data()
|
||||
2
loader_hub/stackoverflow/requirements.txt
Normal file
2
loader_hub/stackoverflow/requirements.txt
Normal file
@ -0,0 +1,2 @@
|
||||
requests
|
||||
llama_index
|
||||
Loading…
x
Reference in New Issue
Block a user