Fix a reliability issue

This commit is contained in:
Jake Poznanski 2024-11-18 09:03:24 -08:00
parent 0af29f1f44
commit 96984fcd77
3 changed files with 18 additions and 3 deletions

View File

@ -298,7 +298,12 @@ async def process_page(args, session: aiohttp.ClientSession, worker_id: int, pdf
try:
async with session.post(COMPLETION_URL, json=query) as response:
response.raise_for_status()
if response.status == 400:
error_text = await response.text()
raise ValueError(f"Got BadRequestError from server: {error_text}, skipping this response")
else:
response.raise_for_status()
base_response_data = await response.json()
if base_response_data["usage"]["total_tokens"] > args.model_max_context:
@ -872,9 +877,9 @@ if __name__ == "__main__":
# - Refactor the work queue into its own file so it's reusable and generic, and it makes temporary work files (prevent issue where if a work item is done, then it stalls because queue was just emptied)
# X Fix the queue release mechanism so that it just does a timeout, based on zero queue size only, so you don't block things
# - Add logging of failed pages and have the stats function read them
# - Add the page rotation check and mechanism
# X Add the page rotation check and mechanism
# - Sglang commit a fix for the context length issue
# - Get a solid benchmark on the stream vs non stream approach
# - sglang error on s3://ai2-s2-pdfs/73ee/35e7ed5c2fb113ceba652284aaa51db7c2fc.pdf-2
# - Client error on attempt 0 for s3://ai2-s2-pdfs/e13c/9e03ce463ba53bfb15b26dbfd55c0bbc5568.pdf-1: 400, message='Bad Request',
# X Client error on attempt 0 for s3://ai2-s2-pdfs/e13c/9e03ce463ba53bfb15b26dbfd55c0bbc5568.pdf-1: 400, message='Bad Request',
# - Fix loading of the model checkpoints, it's so flakey now, maybe use datasets

BIN
tests/gnarly_pdfs/map1.pdf Normal file

Binary file not shown.

View File

@ -143,6 +143,16 @@ class AnchorTest(unittest.TestCase):
self.assertEqual(anchor_text.strip(), "Page dimensions: 612.0x792.0")
# TODO: testExcessiveMapAnchor (below) still fails
def testExcessiveMapAnchor(self):
    """Anchor text for page 1 of the map1.pdf fixture must stay short.

    The text is requested from the "pdfreport" engine with
    target_length=6000 and its length is asserted to be below 4000.
    The text and its length are printed to help with debugging.
    """
    fixtures_dir = os.path.join(os.path.dirname(__file__), "gnarly_pdfs")
    map_pdf = os.path.join(fixtures_dir, "map1.pdf")

    anchor = get_anchor_text(map_pdf, 1, pdf_engine="pdfreport", target_length=6000)

    print(anchor)
    anchor_length = len(anchor)
    print(anchor_length)

    self.assertLess(anchor_length, 4000)
class BuildSilverTest(unittest.TestCase):
def testSmallPage(self):
local_pdf_path = os.path.join(os.path.dirname(__file__), "gnarly_pdfs", "small_page_size.pdf")