mirror of https://github.com/mendableai/firecrawl.git
synced 2025-10-31 01:54:18 +00:00

153 lines · 6.6 KiB · Python

import os
from firecrawl import FirecrawlApp
import json
from dotenv import load_dotenv
from openai import OpenAI

# ANSI color codes
class Colors:
    CYAN = '\033[96m'
    YELLOW = '\033[93m'
    GREEN = '\033[92m'
    RED = '\033[91m'
    MAGENTA = '\033[95m'
    BLUE = '\033[94m'
    RESET = '\033[0m'

# Load environment variables
load_dotenv()

# Retrieve API keys from environment variables
firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY")
openai_api_key = os.getenv("OPENAI_API_KEY")
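
# The .env file loaded above is expected to provide both keys. Illustrative
# contents (placeholder values, not real credentials):
#   FIRECRAWL_API_KEY=fc-...
#   OPENAI_API_KEY=sk-...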

# Initialize the FirecrawlApp and OpenAI client
app = FirecrawlApp(api_key=firecrawl_api_key)
client = OpenAI(api_key=openai_api_key)

# Find the page that most likely contains the objective
def find_relevant_page_via_map(objective, url, app, client):
    try:
        print(f"{Colors.CYAN}Understood. The objective is: {objective}{Colors.RESET}")
        print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}")

        map_prompt = f"""
        The map function generates a list of URLs from a website and it accepts a search parameter. Based on the objective of: {objective}, come up with a 1-2 word search parameter that will help us find the information we need. Only respond with 1-2 words nothing else.
        """

        print(f"{Colors.YELLOW}Analyzing objective to determine optimal search parameter...{Colors.RESET}")
        completion = client.chat.completions.create(
            model="o1-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": map_prompt
                        }
                    ]
                }
            ]
        )

        map_search_parameter = completion.choices[0].message.content
        print(f"{Colors.GREEN}Optimal search parameter identified: {map_search_parameter}{Colors.RESET}")

        print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}")
        map_website = app.map_url(url, params={"search": map_search_parameter})
        print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}")
        print(f"{Colors.GREEN}Located {len(map_website)} relevant links.{Colors.RESET}")
        return map_website
    except Exception as e:
        print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}")
        return None
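
# Note: the code below treats the value returned by map_url as a flat list of
# URL strings; find_objective_in_top_pages guards with an isinstance check in
# case the SDK returns a different shape.
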
# Scrape the top 3 pages and see if the objective is met, if so return in json format else return None
def find_objective_in_top_pages(map_website, objective, app, client):
    try:
        # Get top 3 links from the map result
        top_links = map_website[:3] if isinstance(map_website, list) else []
        print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}")

        for link in top_links:
            print(f"{Colors.YELLOW}Initiating scrape of page: {link}{Colors.RESET}")
            # Scrape the page
            scrape_result = app.scrape_url(link, params={'formats': ['markdown']})
            print(f"{Colors.GREEN}Page scraping completed successfully.{Colors.RESET}")

            # Check if objective is met
            check_prompt = f"""
            Given the following scraped content and objective, determine if the objective is met.
            If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.
            If the objective is not met with confidence, respond with 'Objective not met'.

            Objective: {objective}
            Scraped content: {scrape_result['markdown']}

            Remember:
            1. Only return JSON if you are confident the objective is fully met.
            2. Keep the JSON structure as simple and flat as possible.
            3. Do not include any explanations or markdown formatting in your response.
            """

            completion = client.chat.completions.create(
                model="o1-preview",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": check_prompt
                            }
                        ]
                    }
                ]
            )

            result = completion.choices[0].message.content

            if result != "Objective not met":
                print(f"{Colors.GREEN}Objective potentially fulfilled. Relevant information identified.{Colors.RESET}")
                try:
                    return json.loads(result)
                except json.JSONDecodeError:
                    print(f"{Colors.RED}Error in parsing response. Proceeding to next page...{Colors.RESET}")
            else:
                print(f"{Colors.YELLOW}Objective not met on this page. Proceeding to next link...{Colors.RESET}")

        print(f"{Colors.RED}All available pages analyzed. Objective not fulfilled in examined content.{Colors.RESET}")
        return None

    except Exception as e:
        print(f"{Colors.RED}Error encountered during page analysis: {str(e)}{Colors.RESET}")
        return None
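
# Caveat: the loop above only treats a reply that is exactly "Objective not met"
# as a miss; any other non-JSON reply is attempted as JSON and skipped on a
# JSONDecodeError rather than crashing.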

# Main function to execute the process
def main():
    # Get user input
    url = input(f"{Colors.BLUE}Enter the website to crawl: {Colors.RESET}")
    objective = input(f"{Colors.BLUE}Enter your objective: {Colors.RESET}")

    print(f"{Colors.YELLOW}Initiating web crawling process...{Colors.RESET}")
    # Find the relevant page
    map_website = find_relevant_page_via_map(objective, url, app, client)

    if map_website:
        print(f"{Colors.GREEN}Relevant pages identified. Proceeding with detailed analysis...{Colors.RESET}")
        # Find objective in top pages
        result = find_objective_in_top_pages(map_website, objective, app, client)

        if result:
            print(f"{Colors.GREEN}Objective successfully fulfilled. Extracted information:{Colors.RESET}")
            print(f"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}")
        else:
            print(f"{Colors.RED}Unable to fulfill the objective with the available content.{Colors.RESET}")
    else:
        print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")

if __name__ == "__main__":
    main()
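
# Illustrative setup and run commands (the script filename below is a
# placeholder; use whatever name this file is saved under):
#   pip install firecrawl-py openai python-dotenv
#   python o1_web_crawler.py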
