Remove some old files.
This commit is contained in:
parent b120965b6a
commit bcdd80911f
@@ -1,146 +0,0 @@
import spacy
from spacy.training import Example
import random
import nltk
from nltk.corpus import reuters
import torch


def save_spacy_model_as_torch(nlp, model_dir="models/reuters"):
    # Extract the TextCategorizer component
    textcat = nlp.get_pipe("textcat_multilabel")

    # Convert the weights to a PyTorch state dictionary.
    # The textcat component wraps a Thinc model, which has no named_parameters();
    # walk the Thinc graph and collect each node's set parameters instead.
    state_dict = {}
    for node in textcat.model.walk():
        for param_name in node.param_names:
            if node.has_param(param_name):
                state_dict[f"{node.name}_{node.id}.{param_name}"] = torch.tensor(node.get_param(param_name))

    # Save the state dictionary
    torch.save(state_dict, f"{model_dir}/model_weights.pth")

    # Extract and save the vocabulary
    vocab = extract_vocab(nlp)
    with open(f"{model_dir}/vocab.txt", "w") as vocab_file:
        for word, idx in vocab.items():
            vocab_file.write(f"{word}\t{idx}\n")

    print(f"Model weights and vocabulary saved to: {model_dir}")


def extract_vocab(nlp):
    # Extract the vocabulary from the spaCy model's string store
    vocab = {word: i for i, word in enumerate(nlp.vocab.strings)}
    return vocab


nlp = spacy.load("models/reuters")
save_spacy_model_as_torch(nlp, model_dir="models")


def train_and_save_reuters_model(model_dir="models/reuters"):
    # Ensure the Reuters corpus is downloaded
    nltk.download('reuters')
    nltk.download('punkt')
    if not reuters.fileids():
        print("Reuters corpus not found.")
        return

    # Load a blank English spaCy model
    nlp = spacy.blank("en")

    # Create a TextCategorizer with the ensemble model for multi-label classification
    textcat = nlp.add_pipe("textcat_multilabel")

    # Add labels to the text classifier
    for label in reuters.categories():
        textcat.add_label(label)

    # Prepare training data
    train_examples = []
    for fileid in reuters.fileids():
        categories = reuters.categories(fileid)
        text = reuters.raw(fileid)
        cats = {label: label in categories for label in reuters.categories()}
        # Prepare spaCy Example objects
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {'cats': cats})
        train_examples.append(example)

    # Initialize the text categorizer with the example objects
    nlp.initialize(lambda: train_examples)

    # Train the model
    random.seed(1)
    spacy.util.fix_random_seed(1)
    for i in range(5):  # Adjust iterations for better accuracy
        random.shuffle(train_examples)
        losses = {}
        # Create batches of data
        batches = spacy.util.minibatch(train_examples, size=8)
        for batch in batches:
            nlp.update(batch, drop=0.2, losses=losses)
        print(f"Losses at iteration {i}: {losses}")

    # Save the trained model
    nlp.to_disk(model_dir)
    print(f"Model saved to: {model_dir}")


def train_model(model_dir, additional_epochs=0):
    # Load the model if it exists, otherwise start with a blank model
    is_new_model = False
    try:
        nlp = spacy.load(model_dir)
        print("Model loaded from disk.")
    except IOError:
        print("No existing model found. Starting with a new model.")
        nlp = spacy.blank("en")
        is_new_model = True
        textcat = nlp.add_pipe("textcat_multilabel")
        for label in reuters.categories():
            textcat.add_label(label)

    # Prepare training data
    train_examples = []
    for fileid in reuters.fileids():
        categories = reuters.categories(fileid)
        text = reuters.raw(fileid)
        cats = {label: label in categories for label in reuters.categories()}
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, {'cats': cats})
        train_examples.append(example)

    # Initialize the model only if it was newly created
    # (checking pipe_names here would always fail, because the blank model
    # also had a "textcat_multilabel" pipe added above)
    if is_new_model:
        nlp.initialize(lambda: train_examples)
    else:
        print("Continuing training with existing model.")

    # Train the model
    random.seed(1)
    spacy.util.fix_random_seed(1)
    num_epochs = 5 + additional_epochs
    for i in range(num_epochs):
        random.shuffle(train_examples)
        losses = {}
        batches = spacy.util.minibatch(train_examples, size=8)
        for batch in batches:
            nlp.update(batch, drop=0.2, losses=losses)
        print(f"Losses at iteration {i}: {losses}")

    # Save the trained model
    nlp.to_disk(model_dir)
    print(f"Model saved to: {model_dir}")


def load_model_and_predict(model_dir, text, top_k=3):
    # Load the trained model from the specified directory
    nlp = spacy.load(model_dir)

    # Process the text with the loaded model
    doc = nlp(text)

    # Get the top k categories by score
    top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:top_k]
    print(f"Top {top_k} categories:")

    return top_categories


if __name__ == "__main__":
    train_and_save_reuters_model()
    train_model("models/reuters", additional_epochs=5)
    model_directory = "reuters_model_10"
    print(reuters.categories())
    example_text = "Apple Inc. is reportedly buying a startup for $1 billion"
    r = load_model_and_predict(model_directory, example_text)
    print(r)
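
For reference, a minimal sketch of how the artifacts written by save_spacy_model_as_torch above could be read back. It is illustrative only and not part of the removed file; the paths follow the default model_dir="models/reuters" used in the script.

# Illustrative only: reload the weights and vocabulary exported by save_spacy_model_as_torch.
# Assumes the default model_dir="models/reuters".
import torch

state_dict = torch.load("models/reuters/model_weights.pth")
print({name: tensor.shape for name, tensor in state_dict.items()})

vocab = {}
with open("models/reuters/vocab.txt") as vocab_file:
    for line in vocab_file:
        word, idx = line.rstrip("\n").rsplit("\t", 1)
        vocab[word] = int(idx)
print(f"Loaded {len(vocab)} vocabulary entries")
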
@@ -1,357 +0,0 @@
import os, time
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from pathlib import Path

from .models import UrlModel, CrawlResult
from .database import init_db, get_cached_url, cache_url, DB_PATH, flush_db
from .utils import *
from .chunking_strategy import *
from .extraction_strategy import *
from .crawler_strategy import *
from typing import List
from concurrent.futures import ThreadPoolExecutor
from .config import *


class WebCrawler:
    def __init__(
        self,
        # db_path: str = None,
        crawler_strategy: CrawlerStrategy = None,
        always_by_pass_cache: bool = False,
        verbose: bool = False,
    ):
        # self.db_path = db_path
        self.crawler_strategy = crawler_strategy or LocalSeleniumCrawlerStrategy(verbose=verbose)
        self.always_by_pass_cache = always_by_pass_cache

        # Create the .crawl4ai folder in the user's home directory if it doesn't exist
        self.crawl4ai_folder = os.path.join(Path.home(), ".crawl4ai")
        os.makedirs(self.crawl4ai_folder, exist_ok=True)
        os.makedirs(f"{self.crawl4ai_folder}/cache", exist_ok=True)

        # If db_path is not provided, use the default path
        # if not db_path:
        #     self.db_path = f"{self.crawl4ai_folder}/crawl4ai.db"

        # flush_db()
        init_db()

        self.ready = False

    def warmup(self):
        print("[LOG] 🌤️ Warming up the WebCrawler")
        result = self.run(
            url='https://crawl4ai.uccode.io/',
            word_count_threshold=5,
            extraction_strategy=NoExtractionStrategy(),
            bypass_cache=False,
            verbose=False,
        )
        self.ready = True
        print("[LOG] 🌞 WebCrawler is ready to crawl")

    def fetch_page(
        self,
        url_model: UrlModel,
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        css_selector: str = None,
        screenshot: bool = False,
        use_cached_html: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> CrawlResult:
        return self.run(
            url_model.url,
            word_count_threshold,
            extraction_strategy or NoExtractionStrategy(),
            chunking_strategy,
            bypass_cache=url_model.forced,
            css_selector=css_selector,
            screenshot=screenshot,
            **kwargs,
        )

    def run_old(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        if user_agent:
            self.crawler_strategy.update_user_agent(user_agent)
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        # Check that the extraction strategy is an instance of ExtractionStrategy; if not, raise an error
        if not isinstance(extraction_strategy, ExtractionStrategy):
            raise ValueError("Unsupported extraction strategy")
        if not isinstance(chunking_strategy, ChunkingStrategy):
            raise ValueError("Unsupported chunking strategy")

        # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD
        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        if not bypass_cache and not self.always_by_pass_cache:
            cached = get_cached_url(url)
            if cached:
                return CrawlResult(
                    **{
                        "url": cached[0],
                        "html": cached[1],
                        "cleaned_html": cached[2],
                        "markdown": cached[3],
                        "extracted_content": cached[4],
                        "success": cached[5],
                        "media": json.loads(cached[6] or "{}"),
                        "links": json.loads(cached[7] or "{}"),
                        "metadata": json.loads(cached[8] or "{}"),
                        "screenshot": cached[9],
                        "error_message": "",
                    }
                )

        # Initialize WebDriver for crawling
        t = time.time()
        if kwargs.get("js", None):
            self.crawler_strategy.js_code = kwargs.get("js")
        html = self.crawler_strategy.crawl(url)
        base64_image = None
        if screenshot:
            base64_image = self.crawler_strategy.take_screenshot()
        success = True
        error_message = ""
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))

        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        media = result.get("media", [])
        links = result.get("links", [])

        # Print a professional LOG-style message with the time taken once crawling is done
        if verbose:
            print(
                f"[LOG] 🚀 Crawling done for {url}, success: {success}, time taken: {time.time() - t} seconds"
            )

        extracted_content = []
        if verbose:
            print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")
        t = time.time()
        # Split markdown into sections
        sections = chunking_strategy.chunk(markdown)
        # sections = merge_chunks_based_on_token_threshold(sections, CHUNK_TOKEN_THRESHOLD)

        extracted_content = extraction_strategy.run(
            url, sections,
        )
        extracted_content = json.dumps(extracted_content)

        if verbose:
            print(
                f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds."
            )

        # Cache the result
        cleaned_html = beautify_html(cleaned_html)
        cache_url(
            url,
            html,
            cleaned_html,
            markdown,
            extracted_content,
            success,
            json.dumps(media),
            json.dumps(links),
            json.dumps(metadata),
            screenshot=base64_image,
        )

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=base64_image,
            extracted_content=extracted_content,
            success=success,
            error_message=error_message,
        )

    def fetch_pages(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    ) -> List[CrawlResult]:
        extraction_strategy = extraction_strategy or NoExtractionStrategy()

        # Forward the shared **kwargs through a closure; unpacking `*[kwargs] * len(url_models)`
        # into executor.map would treat each kwargs dict as another positional iterable and break the calls.
        def fetch_page_wrapper(url_model, *args):
            return self.fetch_page(url_model, *args, **kwargs)

        with ThreadPoolExecutor() as executor:
            results = list(
                executor.map(
                    fetch_page_wrapper,
                    url_models,
                    [provider] * len(url_models),
                    [api_token] * len(url_models),
                    [extract_blocks_flag] * len(url_models),
                    [word_count_threshold] * len(url_models),
                    [css_selector] * len(url_models),
                    [screenshot] * len(url_models),
                    [use_cached_html] * len(url_models),
                    [extraction_strategy] * len(url_models),
                    [chunking_strategy] * len(url_models),
                )
            )

        return results

    def run(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    ) -> CrawlResult:
        extraction_strategy = extraction_strategy or NoExtractionStrategy()
        extraction_strategy.verbose = verbose
        if not isinstance(extraction_strategy, ExtractionStrategy):
            raise ValueError("Unsupported extraction strategy")
        if not isinstance(chunking_strategy, ChunkingStrategy):
            raise ValueError("Unsupported chunking strategy")

        if word_count_threshold < MIN_WORD_THRESHOLD:
            word_count_threshold = MIN_WORD_THRESHOLD

        # Check cache first
        cached = None
        extracted_content = None
        if not bypass_cache and not self.always_by_pass_cache:
            cached = get_cached_url(url)

        if cached:
            html = cached[1]
            # Cache rows are stored as (url, html, cleaned_html, markdown, extracted_content,
            # success, media, links, metadata, screenshot); index 4 holds the extracted content.
            extracted_content = cached[4]
            if screenshot:
                screenshot = cached[9]
        else:
            if user_agent:
                self.crawler_strategy.update_user_agent(user_agent)
            html = self.crawler_strategy.crawl(url)
            if screenshot:
                screenshot = self.crawler_strategy.take_screenshot()

        return self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot, verbose, bool(cached), **kwargs)

    def process_html(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: bool,
        verbose: bool,
        is_cached: bool,
        **kwargs,
    ) -> CrawlResult:
        t = time.time()
        # Extract content from HTML
        try:
            result = get_content_of_website(url, html, word_count_threshold, css_selector=css_selector)
            metadata = extract_metadata(html)
            if result is None:
                raise ValueError(f"Failed to extract content from the website: {url}")
        except InvalidCSSSelectorError as e:
            raise ValueError(str(e))

        cleaned_html = result.get("cleaned_html", "")
        markdown = result.get("markdown", "")
        media = result.get("media", [])
        links = result.get("links", [])

        if verbose:
            print(f"[LOG] 🚀 Crawling done for {url}, success: True, time taken: {time.time() - t} seconds")

        if extracted_content is None:
            if verbose:
                print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}")

            sections = chunking_strategy.chunk(markdown)
            extracted_content = extraction_strategy.run(url, sections)
            extracted_content = json.dumps(extracted_content)

            if verbose:
                print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t} seconds.")

        screenshot = None if not screenshot else screenshot

        if not is_cached:
            cache_url(
                url,
                html,
                cleaned_html,
                markdown,
                extracted_content,
                True,
                json.dumps(media),
                json.dumps(links),
                json.dumps(metadata),
                screenshot=screenshot,
            )

        return CrawlResult(
            url=url,
            html=html,
            cleaned_html=cleaned_html,
            markdown=markdown,
            media=media,
            links=links,
            metadata=metadata,
            screenshot=screenshot,
            extracted_content=extracted_content,
            success=True,
            error_message="",
        )
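
For context, a minimal usage sketch of the class being removed, based only on the methods defined above (warmup() and run()); the import path and URL are assumptions for illustration, not part of the removed file.

# Illustrative sketch of driving this WebCrawler, using only methods defined above.
from crawl4ai.web_crawler import WebCrawler  # import path assumed; adjust to the actual package layout

crawler = WebCrawler(verbose=True)
crawler.warmup()                      # primes the Selenium strategy and sets crawler.ready
result = crawler.run(
    url="https://example.com",        # placeholder URL
    word_count_threshold=5,
    bypass_cache=True,
)
print(result.markdown[:200])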