fix(crawler): surface real redirect status codes and keep redirect chain. the 30x response instead of always returning 200. Refs #660

This commit is contained in:
ntohidi 2025-04-30 12:29:17 +02:00
parent 039be1b1ce
commit 1d6a2b9979

View File

@ -744,12 +744,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"after_goto", page, context=context, url=url, response=response, config=config
)
# ──────────────────────────────────────────────────────────────
# Walk the redirect chain. Playwright returns only the last
# hop, so we trace the `request.redirected_from` links until the
# first response that differs from the final one and surface its
# status-code.
# ──────────────────────────────────────────────────────────────
if response is None:
status_code = 200
response_headers = {}
else:
status_code = response.status
response_headers = response.headers
first_resp = response
req = response.request
while req and req.redirected_from:
prev_req = req.redirected_from
prev_resp = await prev_req.response()
if prev_resp: # keep earliest
first_resp = prev_resp
req = prev_req
status_code = first_resp.status
response_headers = first_resp.headers
# if response is None:
# status_code = 200
# response_headers = {}
# else:
# status_code = response.status
# response_headers = response.headers
else:
status_code = 200