"""Reader that pulls in a BoardDocs site.""" from typing import Any, List, Optional from llama_index.readers.base import BaseReader from llama_index.readers.schema.base import Document import json import requests class BoardDocsReader(BaseReader): """BoardDocs doc reader. Read public agendas included on a BoardDocs site. Args: site (str): The BoardDocs site you'd like to index, e.g. "ca/redwood" committee_id (str): The committee on the site you want to index """ def __init__( self, site:str, committee_id:str, ) -> None: """Initialize with parameters.""" self.site = site self.committee_id = committee_id self.base_url = "https://go.boarddocs.com/" + site + "/Board.nsf" # set up the headers required for the server to answer self.headers = { "accept": "application/json, text/javascript, */*; q=0.01", "accept-language": "en-US,en;q=0.9", "content-type": "application/x-www-form-urlencoded; charset=UTF-8", "sec-ch-ua": "\"Google Chrome\";v=\"113\", \"Chromium\";v=\"113\", \"Not-A.Brand\";v=\"24\"", "sec-ch-ua-mobile": "?0", "sec-ch-ua-platform": "\"macOS\"", "sec-fetch-dest": "empty", "sec-fetch-mode": "cors", "sec-fetch-site": "same-origin", "x-requested-with": "XMLHttpRequest" } super().__init__() def get_meeting_list(self) -> List[dict]: """ Returns a list of meetings for the committee Args: None Returns: List[dict]: A list of meetings, each with a meetingID, date, and unid """ meeting_list_url = self.base_url + "/BD-GetMeetingsList?open" data = "current_committee_id=" + self.committee_id response = requests.post(meeting_list_url, headers=self.headers, data=data) meetingsData = json.loads(response.text) meetings = [{"meetingID": meeting.get('unique', None), "date": meeting.get('numberdate', None), "unid": meeting.get('unid', None)} for meeting in meetingsData] return meetings def process_meeting(self, meeting_id:str, index_pdfs:bool = True) -> List[Document]: """ Returns documents from the given meeting """ agenda_url = self.base_url + "/PRINT-AgendaDetailed" # set the meetingID & committee data = "id=" + meeting_id + "&" + "current_committee_id=" + self.committee_id # POST the request! response = requests.post(agenda_url, headers=self.headers, data=data) import html2text from bs4 import BeautifulSoup # parse the returned HTML soup = BeautifulSoup(response.content, "html.parser") agenda_date = soup.find("div", {"class":"print-meeting-date"}).string agenda_title = soup.find("div", {"class":"print-meeting-name"}).string agenda_files = [fd.a.get('href') for fd in soup.find_all("div", {"class":"public-file"})] agenda_data = html2text.html2text(response.text) # TODO: index the linked PDFs in agenda_files! docs = [] agenda_doc = Document(text=agenda_data, doc_id=meeting_id, extra_info={ "committee": self.committee_id, "title": agenda_title, "date": agenda_date, "url": agenda_url, }) docs.append(agenda_doc) return docs def load_data( self, meeting_ids: Optional[List[str]] = None, **load_kwargs: Any ) -> List[Document]: """Load all meetings of the committee. Args: meeting_ids (List[str]): A list of meeting IDs to load. If None, load all meetings. """ # if a list of meetings wasn't provided, enumerate them all if not meeting_ids: meeting_ids = [meeting.get('meetingID') for meeting in self.get_meeting_list()] # process all relevant meetings & return the documents docs = [] for meeting_id in meeting_ids: docs.extend(self.process_meeting(meeting_id)) return docs