diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md index 7e217a8..4b4066a 100644 --- a/ch07/02_dataset-utilities/README.md +++ b/ch07/02_dataset-utilities/README.md @@ -11,8 +11,8 @@ pip install -r requirements-extra.txt -  -## Finding Near-duplicates + +### Finding Near Duplicates The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. For example, @@ -27,13 +27,9 @@ scikit-learn version: 1.3.1 ================================================== - Searching 'instruction' for duplicates ... +Searching 'instruction' for duplicates ... ================================================== -Duplicate pair found with similarity 0.85: -1. Determine the state of matter for helium at room temperature. -2. Determine the state of matter for nitrogen at room temperature. - -Duplicate pair found with similarity 0.98: +Duplicate pair found with similarity 0.94: 1. Edit the following sentence to make it more formal. 2. Edit the sentence to make it more formal. @@ -41,28 +37,21 @@ Duplicate pair found with similarity 1.00: 1. Name a dwarf planet in our solar system. 2. Name a dwarf planet in our solar system. -Duplicate pair found with similarity 0.88: +Duplicate pair found with similarity 0.91: 1. Change the sentences from active voice to passive voice. 2. Change the sentence from passive to active voice. ================================================== - Searching 'input' for duplicates ... +Searching 'input' for duplicates ... ================================================== -Duplicate pair found with similarity 0.88: -1. -2. She said, "I am tired." - +No duplicates found ================================================== - Searching 'output' for duplicates ... +Searching 'output' for duplicates ... ================================================== -Duplicate pair found with similarity 0.82: -1. Helium is in a gaseous state at room temperature. -2. 
Nitrogen is in a gaseous state at room temperature. - Duplicate pair found with similarity 1.00: 1. One dwarf planet in our solar system is Pluto. 2. One dwarf planet in our solar system is Pluto. @@ -70,17 +59,22 @@ Duplicate pair found with similarity 1.00: ``` +  +You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity. +The default threshold is 0.9. + +   -## Creating Passive Voice Entries + ## Creating Passive Voice Entries -- The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below + - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below -```python -{ - 'instruction': 'Identify the verb in the following sentence', - 'input': 'The cat sleeps on the couch.', - 'output': 'The verb in the sentence is "sleeps."', - 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry -} -``` + ```python + { + 'instruction': 'Identify the verb in the following sentence', + 'input': 'The cat sleeps on the couch.', + 'output': 'The verb in the sentence is "sleeps."', + 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry + } + ``` diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index 05beebe..45b2fce 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -6,6 +6,7 @@ import argparse import json +import re from sklearn import __version__ as sklearn_version from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity @@ -29,18 +30,27 @@ example_data = [ ] -def find_near_duplicates(json_data, threshold=0.8, key="instruction"): +def 
preprocess_text(text): + # Lowercase the text + text = text.lower() + # Remove punctuation + text = re.sub(r'[^\w\s]', '', text) + return text + + +def find_near_duplicates(json_data, threshold=0.75, key="instruction"): """The higher the threshold, the more similar the texts have to be to match""" # Extract instructions - text = [item[key] for item in json_data if item[key]] + text = [preprocess_text(item[key]) for item in json_data if item[key]] near_duplicates = [] + indices_to_remove = set() if not text: - return near_duplicates + return json_data, near_duplicates # Vectorize the text data - vectorizer = TfidfVectorizer(stop_words=None) + vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3)) tfidf_matrix = vectorizer.fit_transform(text) # Compute cosine similarity between each pair of entries @@ -51,18 +61,29 @@ for i in range(len(cos_sim_matrix)): for j in range(i+1, len(cos_sim_matrix)): if cos_sim_matrix[i, j] > threshold: + if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1: + continue near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j])) + if key in ("input", "output"): # Don't remove duplicates based on the instruction + indices_to_remove.add(j) # Mark the second entry for removal - return near_duplicates + # Remove the near-duplicate entries + filtered_json_data = [item for index, item in enumerate(json_data) if index not in indices_to_remove] + + return filtered_json_data, near_duplicates -def find_and_print_new_duplicates(json_data): +def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, threshold=0.75): """ Searches each key in the first JSON object for duplicates across a list of JSON objects. Prints the duplicates if found. 
""" for key in json_data[0].keys(): - near_duplicates = find_near_duplicates(json_data, key=key) + + if remove_duplicates: + json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) + else: + _, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) separator = 50 * '=' print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}") if not near_duplicates: @@ -73,6 +94,7 @@ def find_and_print_new_duplicates(json_data): f"Duplicate pair found with similarity {dup[2]:.2f}:\n" f"1. {dup[0][key]}\n2. {dup[1][key]}\n" ) + return json_data if __name__ == "__main__": @@ -84,7 +106,35 @@ if __name__ == "__main__": type=str, help=("Path to the dataset JSON file") ) + parser.add_argument( + "--threshold", + type=float, + default=0.9, + help=("A sensitivity threshold between 0 and 1 where 1 is strictest") + ) + parser.add_argument( + "--remove_duplicates", + action='store_true', + default=False, + help=( + "Removes duplicates based on the 'input' or 'output' keys " + " (but not the 'instruction') and saves the cleaned JSON file as --json_output_file" + ) + ) + parser.add_argument( + "--json_output_file", + type=str, + help=("Path to the dataset JSON file") + ) + args = parser.parse_args() + + if args.remove_duplicates and not args.json_output_file: + raise ValueError( + "Provide an output file via --json_output_file " + "to save the cleaned JSON data." + ) + if not args.json_file: json_data = example_data @@ -92,4 +142,12 @@ if __name__ == "__main__": with open(args.json_file, "r") as file: json_data = json.load(file) - find_and_print_new_duplicates(json_data) + json_data = find_print_and_remove_near_duplicates( + json_data=json_data, + remove_duplicates=args.remove_duplicates, + threshold=args.threshold + ) + + if args.remove_duplicates: + with open(args.json_output_file, "w") as file: + json.dump(json_data, file, indent=4)