mirror of https://github.com/rasbt/LLMs-from-scratch.git (synced 2025-10-26 23:39:53 +00:00)

dataset utils

This commit is contained in:
parent 605b4b03d7
commit 477dea73ec
63 ch07/02_dataset-utilities/README.md Normal file
@@ -0,0 +1,63 @@
# Chapter 7: Instruction and Preference Finetuning

This folder contains utility code that can be used for preparing an instruction dataset.

### Finding near duplicates

The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example:

```bash
python find-near-duplicates.py --json_file instruction-examples.json
```

Executing this command produces output similar to the following:

```
==================================================
Searching 'instruction' for duplicates ...
==================================================
Duplicate pair found with similarity 0.85:
1. Determine the state of matter for helium at room temperature.
2. Determine the state of matter for nitrogen at room temperature.

Duplicate pair found with similarity 0.98:
1. Edit the following sentence to make it more formal.
2. Edit the sentence to make it more formal.

Duplicate pair found with similarity 1.00:
1. Name a dwarf planet in our solar system.
2. Name a dwarf planet in our solar system.

Duplicate pair found with similarity 0.88:
1. Change the sentences from active voice to passive voice.
2. Change the sentence from passive to active voice.


==================================================
Searching 'input' for duplicates ...
==================================================
Duplicate pair found with similarity 0.88:
1.
2. She said, "I am tired."


==================================================
Searching 'output' for duplicates ...
==================================================
Duplicate pair found with similarity 0.82:
1. Helium is in a gaseous state at room temperature.
2. Nitrogen is in a gaseous state at room temperature.

Duplicate pair found with similarity 1.00:
1. One dwarf planet in our solar system is Pluto.
2. One dwarf planet in our solar system is Pluto.
```
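
Under the hood, the script vectorizes each field with scikit-learn's `TfidfVectorizer` and reports entry pairs whose cosine similarity exceeds a threshold (0.8 by default). The `find_near_duplicates` function can also be called programmatically, for example with a stricter threshold. The sketch below is a hypothetical usage example and not part of the chapter code; because the script's filename contains hyphens, the module is loaded via `importlib`, and the 0.95 threshold is an arbitrary choice:

```python
# Hypothetical usage sketch (not included in the repository): call
# find_near_duplicates from find-near-duplicates.py with a stricter threshold.
import importlib.util
import json

# The filename contains hyphens, so load the module from its file path.
spec = importlib.util.spec_from_file_location("near_dups", "find-near-duplicates.py")
near_dups = importlib.util.module_from_spec(spec)
spec.loader.exec_module(near_dups)

with open("instruction-examples.json", "r") as file:
    data = json.load(file)

# Report only pairs whose TF-IDF cosine similarity exceeds 0.95
for first, second, score in near_dups.find_near_duplicates(data, threshold=0.95, key="instruction"):
    print(f"{score:.2f}: {first['instruction']} | {second['instruction']}")
```

Raising the threshold toward 1.0 limits the report to exact or nearly exact duplicates, while lowering it surfaces more loosely paraphrased pairs.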
78 ch07/02_dataset-utilities/find-near-duplicates.py Normal file
@@ -0,0 +1,78 @@
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
# Source for "Build a Large Language Model From Scratch"
# - https://www.manning.com/books/build-a-large-language-model-from-scratch
# Code: https://github.com/rasbt/LLMs-from-scratch

import argparse
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Sample JSON dataset
example_data = [
    {"instruction": "What is the capital of Italy?", "input": "", "output": "The capital of Italy is Rome."},
    {"instruction": "What's the capital city of Italy?", "input": "", "output": "The capital city is Rome."},
    {"instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'", "input": "", "output": "The verb is 'sleeps'."},
    {"instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.", "input": "", "output": "The verb in the sentence is \"sleeps.\""},
    # Add other entries...
]


def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
    """The higher the threshold, the more similar the texts have to be to match."""

    # Keep only the entries with a non-empty value for the given key so that
    # the similarity-matrix indices stay aligned with the entries reported below
    items = [item for item in json_data if item[key]]
    text = [item[key] for item in items]
    near_duplicates = []

    if not text:
        return near_duplicates

    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(text)

    # Compute cosine similarity between each pair of entries
    cos_sim_matrix = cosine_similarity(tfidf_matrix)

    # Find pairs of near-duplicate entries based on the threshold
    for i in range(len(cos_sim_matrix)):
        for j in range(i+1, len(cos_sim_matrix)):
            if cos_sim_matrix[i, j] > threshold:
                near_duplicates.append((items[i], items[j], cos_sim_matrix[i, j]))

    return near_duplicates


def find_and_print_new_duplicates(json_data):
    # Check each key present in the dataset ("instruction", "input", "output", ...)
    for key in json_data[0].keys():
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{50*'='}\n Searching '{key}' for duplicates ...\n{50*'='}")
        if not near_duplicates:
            print("No duplicates found")
        else:
            for dup in near_duplicates:
                print(f"Duplicate pair found with similarity {dup[2]:.2f}:\n"
                      f"1. {dup[0][key]}\n2. {dup[1][key]}\n")


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json_file",
        type=str,
        help="Path to the dataset JSON file"
    )
    args = parser.parse_args()

    if not args.json_file:
        json_data = example_data
    else:
        with open(args.json_file, "r") as file:
            json_data = json.load(file)

    find_and_print_new_duplicates(json_data)
1002 ch07/02_dataset-utilities/instruction-examples.json Normal file
File diff suppressed because it is too large
3 ch07/README.md Normal file
@@ -0,0 +1,3 @@
# Chapter 7: Instruction and Preference Finetuning

In progress ...