LLMs-from-scratch/ch07/02_dataset-utilities/find-near-duplicates.py


# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import argparse
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Sample JSON dataset, used by the script entry point when no --json_file
# argument is supplied. Each entry is a dict with the string fields
# "instruction", "input", and "output". The entries come in two pairs of
# closely related phrasings (capital of Italy / verb identification).
example_data = [
    {"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."},
    {"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."},
    {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."},
    {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""},
    # Add other entries...
]


def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
    """Find pairs of entries whose text under `key` is nearly identical.

    The texts are converted to TF-IDF vectors and compared pairwise via
    cosine similarity. The higher the threshold, the more similar the texts
    have to be to match.

    Args:
        json_data: List of dictionaries (the dataset entries).
        threshold: A pair is reported only if its cosine similarity is
            strictly greater than this value.
        key: Dictionary key whose value is compared across entries.

    Returns:
        A list of `(entry_a, entry_b, similarity)` tuples, one per matching
        pair, with `entry_a` appearing before `entry_b` in `json_data`.
        The list is empty if no entry has a non-empty value for `key`.

    Note:
        scikit-learn's vectorizer may raise `ValueError` when it cannot
        extract any token from the texts; that case is not handled here.
    """
    # Keep the entries themselves (not only their texts) so that the
    # row/column indices of the similarity matrix map back to the correct
    # entries even when entries with a missing or empty `key` are skipped.
    entries = [item for item in json_data if item.get(key)]
    near_duplicates = []

    if not entries:
        return near_duplicates

    text = [item[key] for item in entries]

    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(text)

    # Compute cosine similarity between each pair of entries
    cos_sim_matrix = cosine_similarity(tfidf_matrix)

    # The matrix is symmetric, so only the upper triangle (j > i) is scanned;
    # this also skips the diagonal, where every text matches itself.
    num_entries = len(entries)
    for i in range(num_entries):
        for j in range(i + 1, num_entries):
            similarity = cos_sim_matrix[i, j]
            if similarity > threshold:
                near_duplicates.append((entries[i], entries[j], similarity))

    return near_duplicates


def find_and_print_new_duplicates(json_data):
    """Search every field of the dataset for near-duplicates and print them.

    The fields to search are taken from the keys of the first entry. For each
    field a banner is printed, followed either by "No duplicates found" or by
    one block per near-duplicate pair showing its similarity and both texts.

    Args:
        json_data: List of dictionaries (the dataset entries).
    """
    # Without this guard an empty dataset would fail on json_data[0].
    if not json_data:
        print("No entries found in the dataset")
        return

    separator = 50 * "="
    for key in json_data[0]:
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{separator}\n Searching '{key}' for duplicates ...\n{separator}")
        if not near_duplicates:
            print("No duplicates found")
            continue

        for first, second, similarity in near_duplicates:
            print(f"Duplicate pair found with similarity {similarity:.2f}:\n"
                  f"1. {first[key]}\n2. {second[key]}\n")


if __name__ == "__main__":
    # Command-line entry point: search a JSON dataset for near-duplicate
    # entries, falling back to the built-in example data when no file is given.

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json_file",
        type=str,
        help=("Path to the dataset JSON file")
        )
    args = parser.parse_args()

    if args.json_file:
        # JSON files are UTF-8 by convention; an explicit encoding avoids
        # depending on the platform's locale default.
        with open(args.json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)
    else:
        json_data = example_data

    find_and_print_new_duplicates(json_data)
dataset utils 2024-05-25 11:22:51 -05:00
			`# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).`
			`# Source for "Build a Large Language Model From Scratch"`
			`# - https://www.manning.com/books/build-a-large-language-model-from-scratch`
			`# Code: https://github.com/rasbt/LLMs-from-scratch`

			`import argparse`
			`import json`
			`from sklearn.feature_extraction.text import TfidfVectorizer`
			`from sklearn.metrics.pairwise import cosine_similarity`


			`# Sample JSON dataset`
			`example_data = [`
			`{"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."},`
			`{"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."},`
			`{"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."},`
			`{"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""},`
			`# Add other entries...`
			`]`


			`def find_near_duplicates(json_data, threshold=0.8, key="instruction"):`
			`"""The higher the threshold, the more similar the texts have to be to match"""`

			`# Extract instructions`
			`text = [item[key] for item in json_data if item[key]]`
			`near_duplicates = []`

			`if not text:`
			`return near_duplicates`

			`# Vectorize the text data`
			`vectorizer = TfidfVectorizer(stop_words=None)`
			`tfidf_matrix = vectorizer.fit_transform(text)`

			`# Compute cosine similarity between each pair of entries`
			`cos_sim_matrix = cosine_similarity(tfidf_matrix)`

			`# Find pairs of near-duplicate instructions based on the threshold`

			`for i in range(len(cos_sim_matrix)):`
			`for j in range(i+1, len(cos_sim_matrix)):`
			`if cos_sim_matrix[i, j] > threshold:`
			`near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j]))`

			`return near_duplicates`


			`def find_and_print_new_duplicates(json_data):`
			`for key in json_data[0].keys():`
			`near_duplicates = find_near_duplicates(json_data, key=key)`
			`print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")`
			`if not near_duplicates:`
			`print("No duplicates found")`
			`else:`
			`for dup in near_duplicates:`
			`print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n"`
			`f"1. {dup[0][key]}\n2. {dup[1][key]}\n")`


			`if __name__ == "__main__":`

			`parser = argparse.ArgumentParser()`
			`parser.add_argument(`
			`"--json_file",`
			`type=str,`
			`help=("Path to the dataset JSON file")`
			`)`
			`args = parser.parse_args()`
			`if not args.json_file:`
			`json_data = example_data`

			`else:`
			`with open(args.json_file, "r") as file:`
			`json_data = json.load(file)`

			`find_and_print_new_duplicates(json_data)`