mirror of
https://github.com/mendableai/firecrawl.git
synced 2026-01-17 18:16:33 +00:00
140 lines
4.7 KiB
TypeScript
140 lines
4.7 KiB
TypeScript
import axios, { AxiosResponse } from "axios";
|
|
import fs from "fs";
|
|
import { createReadStream, createWriteStream } from "node:fs";
|
|
import FormData from "form-data";
|
|
import dotenv from "dotenv";
|
|
import pdf from "pdf-parse";
|
|
import path from "path";
|
|
import os from "os";
|
|
import { axiosTimeout } from "../../../lib/timeout";
|
|
import { Logger } from "../../../lib/logger";
|
|
|
|
// Load environment configuration via dotenv before any code below reads
// process.env (e.g. LLAMAPARSE_API_KEY in processPdfToText).
dotenv.config();
|
|
|
|
/**
 * Downloads the PDF at `url` into a temporary file, converts it to text and
 * removes the temporary file afterwards.
 *
 * This function does not reject: any failure while downloading or processing
 * is logged and reported through the returned object instead.
 *
 * @param url - Location of the PDF to download.
 * @param parsePDF - Forwarded to `processPdfToText`; when `false` the raw file
 *   content is returned instead of extracted text.
 * @returns The extracted content together with the HTTP status of the
 *   download. On failure `content` is `""`, `pageStatusCode` is `500` and
 *   `pageError` carries the error message.
 */
export async function fetchAndProcessPdf(url: string, parsePDF: boolean): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  // Remembered outside the try block so the finally clause can clean up even
  // when processing throws after the download succeeded.
  let tempFilePath: string | undefined;

  try {
    const download = await downloadPdf(url);
    tempFilePath = download.tempFilePath;

    const content = await processPdfToText(tempFilePath, parsePDF);
    return {
      content,
      pageStatusCode: download.pageStatusCode,
      pageError: download.pageError,
    };
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances.
    const message = error instanceof Error ? error.message : String(error);
    Logger.error(`Failed to fetch and process PDF: ${message}`);
    return { content: "", pageStatusCode: 500, pageError: message };
  } finally {
    if (tempFilePath !== undefined) {
      try {
        fs.unlinkSync(tempFilePath); // Clean up the temporary file
      } catch (cleanupError) {
        // Cleanup is best-effort: a leftover temp file must not discard
        // content that was already extracted successfully.
        Logger.debug(`Failed to remove temporary PDF file: ${cleanupError}`);
      }
    }
  }
}
|
|
|
|
/**
 * Streams the resource at `url` into a uniquely named file in the OS temp
 * directory.
 *
 * @param url - Location of the PDF to download.
 * @returns The path of the written file, the HTTP status code of the response
 *   and, when the status text is not "OK", that status text as `pageError`.
 *   Rejects if the request fails or if reading the response / writing the
 *   file errors; in the latter case the partial file is removed first.
 */
async function downloadPdf(url: string): Promise<{ tempFilePath: string, pageStatusCode?: number, pageError?: string }> {
  const response = await axios({
    url,
    method: "GET",
    responseType: "stream",
  });

  // Date.now() alone is not unique when several downloads start within the
  // same millisecond, so add the pid and a random component.
  const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2)}`;
  const tempFilePath = path.join(os.tmpdir(), `tempPdf-${uniqueSuffix}.pdf`);
  const writer = createWriteStream(tempFilePath);

  return new Promise((resolve, reject) => {
    let failed = false;

    const fail = (error: Error) => {
      if (failed) {
        return;
      }
      failed = true;
      writer.destroy();
      // Best-effort removal of the partial file; the unlink result is ignored
      // because the original error is what the caller needs to see.
      fs.unlink(tempFilePath, () => reject(error));
    };

    writer.on("finish", () =>
      resolve({
        tempFilePath,
        pageStatusCode: response.status,
        pageError: response.statusText !== "OK" ? response.statusText : undefined,
      })
    );
    writer.on("error", fail);
    // pipe() does not forward source errors to the destination; without this
    // listener a connection dropped mid-transfer would leave the promise
    // pending forever.
    response.data.on("error", fail);

    response.data.pipe(writer);
  });
}
|
|
|
|
/**
 * Converts the PDF stored at `filePath` into a string.
 *
 * - `parsePDF === false`: the file is read as UTF-8 and returned unparsed.
 * - `parsePDF === true` and `LLAMAPARSE_API_KEY` is set: the file is uploaded
 *   to the LlamaIndex parsing API and the result endpoint is polled; if that
 *   fails or no result becomes available, the file is parsed locally.
 * - `parsePDF === true` without an API key: the file is parsed locally.
 *
 * This function does not reject; every failure is logged and results in `""`.
 *
 * @param filePath - Path of the PDF on the local file system.
 * @param parsePDF - Whether text should be extracted from the PDF.
 * @returns The extracted (or raw) content, or `""` when nothing could be read.
 */
export async function processPdfToText(filePath: string, parsePDF: boolean): Promise<string> {
  if (!parsePDF) {
    try {
      return fs.readFileSync(filePath, "utf-8");
    } catch (error) {
      Logger.error(`Failed to read PDF file: ${error}`);
      return "";
    }
  }

  const apiKey = process.env.LLAMAPARSE_API_KEY;
  if (apiKey) {
    Logger.debug("Processing pdf document w/ LlamaIndex");
    const headers = {
      Authorization: `Bearer ${apiKey}`,
    };
    const baseUrl = "https://api.cloud.llamaindex.ai/api/parsing";

    try {
      const formData = new FormData();
      formData.append("file", createReadStream(filePath), {
        filename: filePath,
        contentType: "application/pdf",
      });

      const uploadResponse = await axios.post(`${baseUrl}/upload`, formData, {
        headers: {
          ...headers,
          ...formData.getHeaders(),
        },
      });

      const jobId = uploadResponse.data.id;
      const resultType = "text";
      const resultUrl = `${baseUrl}/job/${jobId}/result/${resultType}`;

      const maxAttempts = 10; // Maximum number of polling attempts
      const retryDelayMs = 500; // Pause between two polling attempts

      for (let attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
          const resultResponse: AxiosResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
          if (resultResponse.status === 200) {
            const text: unknown = resultResponse.data?.[resultType];
            // Only a string payload counts as a result; anything else is
            // treated like a result that is not ready yet.
            if (typeof text === "string") {
              return text;
            }
          }
        } catch (error) {
          Logger.debug("Error fetching result w/ LlamaIndex");
        }

        if (attempt < maxAttempts) {
          await new Promise((resolve) => setTimeout(resolve, retryDelayMs));
        }
      }

      Logger.error("Max attempts reached, unable to fetch result.");
    } catch (error) {
      Logger.debug("Error processing pdf document w/ LlamaIndex(2)");
    }
    // Reaching this point means the remote service produced no usable text;
    // fall through to the local parser below.
  }

  try {
    return await processPdf(filePath);
  } catch (error) {
    Logger.error(`Failed to process PDF: ${error}`);
    return "";
  }
}
|
|
|
|
/**
 * Extracts the text of a local PDF with the `pdf-parse` import.
 *
 * @param file - Path of the PDF on the local file system.
 * @returns The `text` field of the parse result.
 *   Read and parse errors are not handled here; they reject the returned
 *   promise.
 */
async function processPdf(file: string) {
  const buffer = fs.readFileSync(file);
  const parsed = await pdf(buffer);
  return parsed.text;
}