David E. Weekly 1895495200
Add BoardDocs Support (#257)
Co-authored-by: Jerry Liu <jerryjliu98@gmail.com>
2023-05-17 09:16:14 -07:00

126 lines
4.3 KiB
Python

"""Reader that pulls in a BoardDocs site."""
from typing import Any, List, Optional
from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
import json
import requests
class BoardDocsReader(BaseReader):
"""BoardDocs doc reader.
Read public agendas included on a BoardDocs site.
Args:
site (str): The BoardDocs site you'd like to index, e.g. "ca/redwood"
committee_id (str): The committee on the site you want to index
"""
def __init__(
self,
site:str,
committee_id:str,
) -> None:
"""Initialize with parameters."""
self.site = site
self.committee_id = committee_id
self.base_url = "https://go.boarddocs.com/" + site + "/Board.nsf"
# set up the headers required for the server to answer
self.headers = {
"accept": "application/json, text/javascript, */*; q=0.01",
"accept-language": "en-US,en;q=0.9",
"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
"sec-ch-ua": "\"Google Chrome\";v=\"113\", \"Chromium\";v=\"113\", \"Not-A.Brand\";v=\"24\"",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "\"macOS\"",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin",
"x-requested-with": "XMLHttpRequest"
}
super().__init__()
def get_meeting_list(self) -> List[dict]:
"""
Returns a list of meetings for the committee
Args:
None
Returns:
List[dict]: A list of meetings, each with a meetingID, date, and unid
"""
meeting_list_url = self.base_url + "/BD-GetMeetingsList?open"
data = "current_committee_id=" + self.committee_id
response = requests.post(meeting_list_url, headers=self.headers, data=data)
meetingsData = json.loads(response.text)
meetings = [{"meetingID": meeting.get('unique', None),
"date": meeting.get('numberdate', None),
"unid": meeting.get('unid', None)} for meeting in meetingsData]
return meetings
def process_meeting(self,
meeting_id:str,
index_pdfs:bool = True) -> List[Document]:
"""
Returns documents from the given meeting
"""
agenda_url = self.base_url + "/PRINT-AgendaDetailed"
# set the meetingID & committee
data = "id=" + meeting_id + "&" + "current_committee_id=" + self.committee_id
# POST the request!
response = requests.post(agenda_url, headers=self.headers, data=data)
import html2text
from bs4 import BeautifulSoup
# parse the returned HTML
soup = BeautifulSoup(response.content, "html.parser")
agenda_date = soup.find("div", {"class":"print-meeting-date"}).string
agenda_title = soup.find("div", {"class":"print-meeting-name"}).string
agenda_files = [fd.a.get('href') for fd in soup.find_all("div", {"class":"public-file"})]
agenda_data = html2text.html2text(response.text)
# TODO: index the linked PDFs in agenda_files!
docs = []
agenda_doc = Document(text=agenda_data,
doc_id=meeting_id,
extra_info={
"committee": self.committee_id,
"title": agenda_title,
"date": agenda_date,
"url": agenda_url,
})
docs.append(agenda_doc)
return docs
def load_data(
self,
meeting_ids: Optional[List[str]] = None,
**load_kwargs: Any
) -> List[Document]:
"""Load all meetings of the committee.
Args:
meeting_ids (List[str]): A list of meeting IDs to load. If None, load all meetings.
"""
# if a list of meetings wasn't provided, enumerate them all
if not meeting_ids:
meeting_ids = [meeting.get('meetingID') for meeting in self.get_meeting_list()]
# process all relevant meetings & return the documents
docs = []
for meeting_id in meeting_ids:
docs.extend(self.process_meeting(meeting_id))
return docs