mirror of
				https://github.com/rasbt/LLMs-from-scratch.git
				synced 2025-11-03 19:30:26 +00:00 
			
		
		
		
	dataset utils
This commit is contained in:
		
							parent
							
								
									49b8e2e767
								
							
						
					
					
						commit
						abdb2fc61f
					
				
							
								
								
									
										63
									
								
								ch07/02_dataset-utilities/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								ch07/02_dataset-utilities/README.md
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,63 @@
 | 
			
		||||
# Chapter 7: Instruction and Preference Finetuning
 | 
			
		||||
 | 
			
		||||
This folder contains utility code that can be used for preparing an instruction dataset.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
### Finding near duplicates
 | 
			
		||||
 | 
			
		||||
The `find-near-duplicates.py` script can be used to identify duplicates and near-duplicates in an instruction dataset. For example:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```bash
 | 
			
		||||
python find-near-duplicates.py --json_file instruction-examples.json
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
==================================================
 | 
			
		||||
 Searching 'instruction' for duplicates ...
 | 
			
		||||
==================================================
 | 
			
		||||
Duplicate pair found with similarity 0.85:
 | 
			
		||||
1. Determine the state of matter for helium at room temperature.
 | 
			
		||||
2. Determine the state of matter for nitrogen at room temperature.
 | 
			
		||||
 | 
			
		||||
Duplicate pair found with similarity 0.98:
 | 
			
		||||
1. Edit the following sentence to make it more formal.
 | 
			
		||||
2. Edit the sentence to make it more formal.
 | 
			
		||||
 | 
			
		||||
Duplicate pair found with similarity 1.00:
 | 
			
		||||
1. Name a dwarf planet in our solar system.
 | 
			
		||||
2. Name a dwarf planet in our solar system.
 | 
			
		||||
 | 
			
		||||
Duplicate pair found with similarity 0.88:
 | 
			
		||||
1. Change the sentences from active voice to passive voice.
 | 
			
		||||
2. Change the sentence from passive to active voice.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
==================================================
 | 
			
		||||
 Searching 'input' for duplicates ...
 | 
			
		||||
==================================================
 | 
			
		||||
Duplicate pair found with similarity 0.88:
 | 
			
		||||
1. 
 | 
			
		||||
2. She said, "I am tired."
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
==================================================
 | 
			
		||||
 Searching 'output' for duplicates ...
 | 
			
		||||
==================================================
 | 
			
		||||
Duplicate pair found with similarity 0.82:
 | 
			
		||||
1. Helium is in a gaseous state at room temperature.
 | 
			
		||||
2. Nitrogen is in a gaseous state at room temperature.
 | 
			
		||||
 | 
			
		||||
Duplicate pair found with similarity 1.00:
 | 
			
		||||
1. One dwarf planet in our solar system is Pluto.
 | 
			
		||||
2. One dwarf planet in our solar system is Pluto.
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
```
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										78
									
								
								ch07/02_dataset-utilities/find-near-duplicates.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										78
									
								
								ch07/02_dataset-utilities/find-near-duplicates.py
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,78 @@
 | 
			
		||||
 | 
			
		||||
# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt).
 | 
			
		||||
# Source for "Build a Large Language Model From Scratch"
 | 
			
		||||
#   - https://www.manning.com/books/build-a-large-language-model-from-scratch
 | 
			
		||||
# Code: https://github.com/rasbt/LLMs-from-scratch
 | 
			
		||||
 | 
			
		||||
import argparse
 | 
			
		||||
import json
 | 
			
		||||
from sklearn.feature_extraction.text import TfidfVectorizer
 | 
			
		||||
from sklearn.metrics.pairwise import cosine_similarity
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
# Small built-in sample dataset: each entry has an "instruction", an
# "input" and an "output" field
example_data = [
    {
        "instruction": "What is the capital of Italy?",
        "input": "",
        "output": "The capital of Italy is Rome.",
    },
    {
        "instruction": "What's the capital city of Italy?",
        "input": "",
        "output": "The capital city is Rome.",
    },
    {
        "instruction": "Identify the main verb in the sentence: 'The cat sleeps on the couch.'",
        "input": "",
        "output": "The verb is 'sleeps'.",
    },
    {
        "instruction": "Identify the verb in the following sentence: The cat sleeps on the couch.",
        "input": "",
        "output": "The verb in the sentence is \"sleeps.\"",
    },
    # Add other entries...
]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def find_near_duplicates(json_data, threshold=0.8, key="instruction"):
    """Find pairs of dataset entries whose `key` text is nearly identical.

    The texts are vectorized with TF-IDF and compared pairwise via cosine
    similarity. The higher the threshold, the more similar the texts have
    to be to match.

    Args:
        json_data: List of dict entries (e.g., with "instruction", "input",
            and "output" fields).
        threshold: A pair is reported when its cosine similarity is
            strictly greater than this value.
        key: Name of the field whose text is compared.

    Returns:
        A list of `(entry_a, entry_b, similarity)` tuples, where both
        entries are items of `json_data`. Entries whose `key` field is
        missing or empty are never part of a pair.
    """
    # Keep the entries themselves (not only their text) so that row i of
    # the similarity matrix always corresponds to candidates[i]. Indexing
    # back into the unfiltered json_data would report the wrong entries
    # as soon as one entry with an empty field has been skipped.
    candidates = [item for item in json_data if item.get(key)]
    near_duplicates = []

    # At least two non-empty texts are needed to form a pair
    if len(candidates) < 2:
        return near_duplicates

    text = [item[key] for item in candidates]

    # Vectorize the text data
    vectorizer = TfidfVectorizer(stop_words=None)
    tfidf_matrix = vectorizer.fit_transform(text)

    # Compute cosine similarity between each pair of entries
    cos_sim_matrix = cosine_similarity(tfidf_matrix)

    # Collect pairs above the threshold; only the upper triangle (j > i) is
    # visited so each pair is reported once and self-matches are excluded
    num_texts = len(candidates)
    for i in range(num_texts):
        for j in range(i + 1, num_texts):
            similarity = cos_sim_matrix[i, j]
            if similarity > threshold:
                near_duplicates.append((candidates[i], candidates[j], similarity))

    return near_duplicates
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def find_and_print_new_duplicates(json_data):
    """Search each field of the dataset for near-duplicates and print them.

    The fields to search are taken from the keys of the first entry. For
    every field, a header is printed, followed either by the near-duplicate
    pairs and their similarity scores or by "No duplicates found".

    Args:
        json_data: List of dict entries making up the dataset.

    Returns:
        None. Results are written to standard output.
    """
    if not json_data:
        # Without a first entry there are no field names to iterate over
        # (json_data[0] would raise IndexError)
        print("No entries found in dataset")
        return

    separator = 50 * "="
    for key in json_data[0]:
        near_duplicates = find_near_duplicates(json_data, key=key)
        print(f"\n\n{separator}\n Searching '{key}' for duplicates ...\n{separator}")
        if not near_duplicates:
            print("No duplicates found")
            continue
        for first, second, similarity in near_duplicates:
            print(
                f"Duplicate pair found with similarity {similarity:.2f}:\n"
                f"1. {first[key]}\n2. {second[key]}\n"
            )
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == "__main__":
    # Command-line entry point: load a dataset and report near-duplicates
    # for each of its fields.

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json_file",
        type=str,
        help=("Path to the dataset JSON file")
    )
    args = parser.parse_args()

    if not args.json_file:
        # No file given: fall back to the built-in sample dataset
        json_data = example_data
    else:
        # JSON text is UTF-8; pass the encoding explicitly instead of
        # relying on the platform default, which may not be UTF-8
        with open(args.json_file, "r", encoding="utf-8") as file:
            json_data = json.load(file)

    find_and_print_new_duplicates(json_data)
 | 
			
		||||
							
								
								
									
										1002
									
								
								ch07/02_dataset-utilities/instruction-examples.json
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1002
									
								
								ch07/02_dataset-utilities/instruction-examples.json
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
							
								
								
									
										3
									
								
								ch07/README.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										3
									
								
								ch07/README.md
									
									
									
									
									
										Normal file
									
								
							@ -0,0 +1,3 @@
 | 
			
		||||
# Chapter 7: Instruction and Preference Finetuning
 | 
			
		||||
 | 
			
		||||
In progress ...
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user