Gergő Móricz 1ddd663020 fix: pkgvuln
2025-09-15 15:05:22 +02:00

97 lines
2.6 KiB
TypeScript

//@ts-ignore
import * as fs from 'fs'
import FirecrawlApp from '@mendable/firecrawl-js'
import 'dotenv/config'
import { config } from 'dotenv'
import { z } from 'zod'
config()
/**
 * Scrapes Airbnb listings for San Francisco through Firecrawl.
 *
 * Steps:
 *  1. Scrape the search page and extract its pagination links.
 *  2. Scrape every pagination page in parallel and extract its listings.
 *  3. Write the combined listings to `airbnb_listings.json` in the current
 *     working directory.
 *
 * Reads the Firecrawl API key from `process.env.FIRECRAWL_API_KEY`.
 *
 * @returns {Promise<string | undefined>} The pretty-printed JSON that was
 *   written to `airbnb_listings.json`, or `undefined` if any step failed
 *   (the failure is logged with `console.error` rather than rethrown).
 */
export async function scrapeAirbnb() {
  try {
    // Initialize the Firecrawl client with the API key from the environment
    const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY })

    // Search page to start from, and the origin used to resolve relative links
    const listingsUrl =
      'https://www.airbnb.com/s/San-Francisco--CA--United-States/homes'
    const baseUrl = 'https://www.airbnb.com'

    // Schema describing the pagination links to extract
    const paginationSchema = z.object({
      page_links: z
        .array(
          z.object({
            link: z.string(),
          })
        )
        .describe('Pagination links in the bottom of the page.'),
    })

    // Scrape the search page to collect the pagination links
    const linksData = await app.scrape(listingsUrl, {
      onlyMainContent: false,
      formats: [
        {
          type: 'json',
          schema: paginationSchema,
        },
      ],
      timeout: 50000, // generous timeout: Airbnb sometimes stalls
    })
    console.log(linksData.json)

    // Resolve a possibly-relative href against the Airbnb origin.
    // Returns null for values that cannot be parsed as a URL.
    const toAbsoluteUrl = (href) => {
      try {
        return new URL(href, baseUrl).href
      } catch {
        return null
      }
    }

    // The extraction may come back without JSON or without `page_links`;
    // treat that the same as "no pagination links found".
    const extractedLinks = linksData.json?.page_links ?? []
    const resolvedLinks = extractedLinks
      .map((entry) => entry?.link)
      .filter((href) => typeof href === 'string' && href.trim() !== '')
      .map((href) => toAbsoluteUrl(href.trim()))
      .filter((href) => href !== null)

    // De-duplicate so the same page is not scraped (and stored) twice
    const uniqueLinks = [...new Set(resolvedLinks)]

    // Fall back to the search page itself when no pagination links were found
    const paginationLinks = uniqueLinks.length > 0 ? uniqueLinks : [listingsUrl]

    // Schema describing the listings to extract from each page
    const schema = z.object({
      listings: z
        .array(
          z.object({
            title: z.string(),
            price_per_night: z.number(),
            location: z.string(),
            rating: z.number().optional(),
            reviews: z.number().optional(),
          })
        )
        .describe('Airbnb listings in San Francisco'),
    })

    // Scrape a single page; a page without extracted JSON contributes nothing
    const scrapeListings = async (url) => {
      const result = await app.scrape(url, {
        onlyMainContent: false,
        formats: [
          {
            type: 'json',
            schema: schema,
          },
        ],
      })
      return result.json?.listings ?? []
    }

    // Scrape all pages in parallel; a single failing page rejects the batch
    const listingsResults = await Promise.all(
      paginationLinks.map((link) => scrapeListings(link))
    )

    // Flatten the per-page arrays into one list
    const allListings = listingsResults.flat()

    // Serialize once: the same string is written to disk and returned
    const serializedListings = JSON.stringify(allListings, null, 2)
    fs.writeFileSync('airbnb_listings.json', serializedListings)
    return serializedListings
  } catch (error) {
    // Thrown values are not guaranteed to be Error instances
    const message = error instanceof Error ? error.message : String(error)
    console.error('An error occurred:', message)
    return undefined
  }
}