diff --git a/ch07/02_dataset-utilities/README.md b/ch07/02_dataset-utilities/README.md index 7e217a8..4b4066a 100644 --- a/ch07/02_dataset-utilities/README.md +++ b/ch07/02_dataset-utilities/README.md @@ -11,8 +11,8 @@ pip install -r requirements-extra.txt -  -## Finding Near-duplicates + +### Finding Near Duplicates The `find-near-duplicates.py` function can be used to identify duplicates and near-duplicates in an instruction dataset. For example, @@ -27,13 +27,9 @@ scikit-learn version: 1.3.1 ================================================== - Searching 'instruction' for duplicates ... +Searching 'instruction' for duplicates ... ================================================== -Duplicate pair found with similarity 0.85: -1. Determine the state of matter for helium at room temperature. -2. Determine the state of matter for nitrogen at room temperature. - -Duplicate pair found with similarity 0.98: +Duplicate pair found with similarity 0.94: 1. Edit the following sentence to make it more formal. 2. Edit the sentence to make it more formal. @@ -41,28 +37,21 @@ Duplicate pair found with similarity 1.00: 1. Name a dwarf planet in our solar system. 2. Name a dwarf planet in our solar system. -Duplicate pair found with similarity 0.88: +Duplicate pair found with similarity 0.91: 1. Change the sentences from active voice to passive voice. 2. Change the sentence from passive to active voice. ================================================== - Searching 'input' for duplicates ... +Searching 'input' for duplicates ... ================================================== -Duplicate pair found with similarity 0.88: -1. -2. She said, "I am tired." - +No duplicates found ================================================== - Searching 'output' for duplicates ... +Searching 'output' for duplicates ... ================================================== -Duplicate pair found with similarity 0.82: -1. Helium is in a gaseous state at room temperature. -2. 
Nitrogen is in a gaseous state at room temperature. - Duplicate pair found with similarity 1.00: 1. One dwarf planet in our solar system is Pluto. 2. One dwarf planet in our solar system is Pluto. @@ -70,17 +59,22 @@ Duplicate pair found with similarity 1.00: ``` +  +You can use the `--threshold` setting with a value between 0 and 1 to decrease or increase the sensitivity. +The default threshold is 0.9. + +   -## Creating Passive Voice Entries + ## Creating Passive Voice Entries -- The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below + - The [create-passive-voice-entries.ipynb](create-passive-voice-entries.ipynb) notebook uses OpenAI's GPT-4 to create "passive voice" entries for an instruction dataset, as shown in the example below -```python -{ - 'instruction': 'Identify the verb in the following sentence', - 'input': 'The cat sleeps on the couch.', - 'output': 'The verb in the sentence is "sleeps."', - 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry -} -``` + ```python + { + 'instruction': 'Identify the verb in the following sentence', + 'input': 'The cat sleeps on the couch.', + 'output': 'The verb in the sentence is "sleeps."', + 'output_2': 'The sentence is "sleeps."' # <---- Newly created entry + } + ``` diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py index 05beebe..45b2fce 100644 --- a/ch07/02_dataset-utilities/find-near-duplicates.py +++ b/ch07/02_dataset-utilities/find-near-duplicates.py @@ -6,6 +6,7 @@ import argparse import json +import re from sklearn import __version__ as sklearn_version from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity @@ -29,18 +30,27 @@ example_data = [ ] -def find_near_duplicates(json_data, threshold=0.8, key="instruction"): +def 
preprocess_text(text): + # Lowercase the text + text = text.lower() + # Remove punctuation + text = re.sub(r'[^\w\s]', '', text) + return text + + +def find_near_duplicates(json_data, threshold=0.75, key="instruction"): """The higher the threshold, the more similar the texts have to be to match""" # Extract instructions - text = [item[key] for item in json_data if item[key]] + text = [preprocess_text(item[key]) for item in json_data if item[key]] near_duplicates = [] + indices_to_remove = set() if not text: - return near_duplicates + return json_data, near_duplicates # Vectorize the text data - vectorizer = TfidfVectorizer(stop_words=None) + vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3)) tfidf_matrix = vectorizer.fit_transform(text) # Compute cosine similarity between each pair of entries @@ -51,18 +61,29 @@ for i in range(len(cos_sim_matrix)): for j in range(i+1, len(cos_sim_matrix)): if cos_sim_matrix[i, j] > threshold: + if len(json_data[i][key]) <= 1 or len(json_data[j][key]) <= 1: + continue near_duplicates.append((json_data[i], json_data[j], cos_sim_matrix[i, j])) + if key in ("input", "output"): # Don't remove duplicates based on the instruction + indices_to_remove.add(j) # Mark the second entry for removal - return near_duplicates + # Remove the near-duplicate entries + filtered_json_data = [item for index, item in enumerate(json_data) if index not in indices_to_remove] + + return filtered_json_data, near_duplicates -def find_and_print_new_duplicates(json_data): +def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, threshold=0.75): """ Searches each key in the first JSON object for duplicates across a list of JSON objects. Prints the duplicates if found. 
""" for key in json_data[0].keys(): - near_duplicates = find_near_duplicates(json_data, key=key) + + if remove_duplicates: + json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) + else: + _, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold) separator = 50 * '=' print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}") if not near_duplicates: @@ -73,6 +94,7 @@ def find_and_print_new_duplicates(json_data): f"Duplicate pair found with similarity {dup[2]:.2f}:\n" f"1. {dup[0][key]}\n2. {dup[1][key]}\n" ) + return json_data if __name__ == "__main__": @@ -84,7 +106,35 @@ if __name__ == "__main__": type=str, help=("Path to the dataset JSON file") ) + parser.add_argument( + "--threshold", + type=float, + default=0.9, + help=("A sensitivity threshold between 0 and 1 where 1 is strictest") + ) + parser.add_argument( + "--remove_duplicates", + action='store_true', + default=False, + help=( + "Removes duplicates based on the 'input' or 'output' keys " + " (but not the 'instruction') and saves the cleaned JSON file as --json_output_file" + ) + ) + parser.add_argument( + "--json_output_file", + type=str, + help=("Path to the dataset JSON file") + ) + args = parser.parse_args() + + if args.remove_duplicates and not args.json_output_file: + raise ValueError( + "Provide an output file via --json_output_file " + "to save the cleaned JSON data." + ) + if not args.json_file: json_data = example_data @@ -92,4 +142,12 @@ if __name__ == "__main__": with open(args.json_file, "r") as file: json_data = json.load(file) - find_and_print_new_duplicates(json_data) + json_data = find_print_and_remove_near_duplicates( + json_data=json_data, + remove_duplicates=args.remove_duplicates, + threshold=args.threshold + ) + + if args.remove_duplicates: + with open(args.json_output_file, "w") as file: + json.dump(json_data, file, indent=4)