diff --git a/.github/scripts/check_double_quotes.py b/.github/scripts/check_double_quotes.py
new file mode 100644
index 0000000..ef43ff1
--- /dev/null
+++ b/.github/scripts/check_double_quotes.py
@@ -0,0 +1,158 @@
+# Copyright (c) Sebastian Raschka under Apache License 2.0 (see LICENSE.txt)
+# Source for "Build a Reasoning Model (From Scratch)": https://mng.bz/lZ5B
+# Code repository: https://github.com/rasbt/reasoning-from-scratch
+
+# Verify that Python source files (and optionally notebooks) use double quotes for strings.
+
+import argparse
+import ast
+import io
+import json
+import sys
+import tokenize
+from pathlib import Path
+
+EXCLUDED_DIRS = {
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".pytest_cache",
+    ".ruff_cache",
+    ".svn",
+    ".tox",
+    ".venv",
+    "__pycache__",
+    "build",
+    "dist",
+    "node_modules",
+}
+
+PREFIX_CHARS = {"r", "u", "f", "b"}
+SINGLE_QUOTE = "'"
+DOUBLE_QUOTE = "\""
+TRIPLE_SINGLE = SINGLE_QUOTE * 3
+TRIPLE_DOUBLE = DOUBLE_QUOTE * 3
+
+
+def should_skip(path):
+    parts = set(path.parts)
+    return bool(EXCLUDED_DIRS & parts)
+
+
+def collect_fstring_expr_string_positions(source):
+    """
+    Return set of (lineno, col_offset) for string literals that appear inside
+    formatted expressions of f-strings. These should be exempt from the double
+    quote check, since enforcing double quotes there is unnecessarily strict.
+    """
+    try:
+        tree = ast.parse(source)
+    except SyntaxError:
+        return set()
+
+    positions = set()
+
+    class Collector(ast.NodeVisitor):
+        def visit_JoinedStr(self, node):
+            for value in node.values:
+                if isinstance(value, ast.FormattedValue):
+                    self._collect_from_expr(value.value)
+            # Continue walking to catch nested f-strings within expressions
+            self.generic_visit(node)
+
+        def _collect_from_expr(self, node):
+            if isinstance(node, ast.Constant) and isinstance(node.value, str):
+                positions.add((node.lineno, node.col_offset))
+            elif isinstance(node, ast.Str):  # Python <3.8 compatibility
+                positions.add((node.lineno, node.col_offset))
+            else:
+                for child in ast.iter_child_nodes(node):
+                    self._collect_from_expr(child)
+
+    Collector().visit(tree)
+    return positions
+
+
+def check_quotes_in_source(source, path):
+    violations = []
+    ignored_positions = collect_fstring_expr_string_positions(source)
+    tokens = tokenize.generate_tokens(io.StringIO(source).readline)
+    for tok_type, tok_str, start, _, _ in tokens:
+        if tok_type == tokenize.STRING:
+            if start in ignored_positions:
+                continue
+            lowered = tok_str.lower()
+            # ignore triple-quoted strings
+            if lowered.startswith((TRIPLE_DOUBLE, TRIPLE_SINGLE)):
+                continue
+
+            # find the prefix and quote type
+            # prefix = ""
+            for c in PREFIX_CHARS:
+                if lowered.startswith(c):
+                    # prefix = c
+                    lowered = lowered[1:]
+                    break
+
+            # report if not using double quotes
+            if lowered.startswith(SINGLE_QUOTE):
+                line, col = start
+                violations.append(f"{path}:{line}:{col}: uses single quotes")
+    return violations
+
+
+def check_file(path):
+    try:
+        if path.suffix == ".ipynb":
+            return check_notebook(path)
+        else:
+            text = path.read_text(encoding="utf-8")
+            return check_quotes_in_source(text, path)
+    except Exception as e:
+        return [f"{path}: failed to check ({e})"]
+
+
+def check_notebook(path):
+    violations = []
+    with open(path, encoding="utf-8") as f:
+        nb = json.load(f)
+    for cell in nb.get("cells", []):
+        if cell.get("cell_type") == "code":
+            src = "".join(cell.get("source", []))
+            violations.extend(check_quotes_in_source(src, path))
+    return violations
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Verify double-quoted string literals.")
+    parser.add_argument(
+        "--include-notebooks",
+        action="store_true",
+        help="Also scan Jupyter notebooks (.ipynb files) for single-quoted strings.",
+    )
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    project_root = Path(".").resolve()
+    py_files = sorted(project_root.rglob("*.py"))
+    notebook_files = sorted(project_root.rglob("*.ipynb")) if args.include_notebooks else []
+
+    violations = []
+    for path in py_files + notebook_files:
+        if should_skip(path):
+            continue
+        violations.extend(check_file(path))
+
+    if violations:
+        print("\n".join(violations))
+        print(f"\n{len(violations)} violations found.")
+        return 1
+
+    print("All files use double quotes correctly.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
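
For reviewers who want to sanity-check the new script's behavior in isolation, here is a minimal, hypothetical usage sketch. It is not part of this diff; it only calls check_quotes_in_source as defined above and assumes it is run from .github/scripts/ so the module is importable:

    # Hypothetical demo: exercise the checker on an in-memory source string.
    from check_double_quotes import check_quotes_in_source

    sample = "greeting = 'hello'\ntitle = \"ok\"\nlabel = f\"{'x'}\"\n"
    for violation in check_quotes_in_source(sample, "sample.py"):
        print(violation)
    # The single-quoted 'hello' is reported ("sample.py:1:11: uses single quotes"),
    # "ok" passes, and 'x' is exempt because it sits inside an f-string expression.
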
diff --git a/appendix-D/01_main-chapter-code/previous_chapters.py b/appendix-D/01_main-chapter-code/previous_chapters.py
index 58acd18..38b062f 100644
--- a/appendix-D/01_main-chapter-code/previous_chapters.py
+++ b/appendix-D/01_main-chapter-code/previous_chapters.py
@@ -73,7 +73,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
diff --git a/appendix-E/01_main-chapter-code/previous_chapters.py b/appendix-E/01_main-chapter-code/previous_chapters.py
index 248995b..2a3022f 100644
--- a/appendix-E/01_main-chapter-code/previous_chapters.py
+++ b/appendix-E/01_main-chapter-code/previous_chapters.py
@@ -80,7 +80,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -257,8 +257,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
@@ -318,7 +318,7 @@ def load_weights_into_gpt(gpt, params):
 
 
 def text_to_token_ids(text, tokenizer):
-    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
+    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
     encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
     return encoded_tensor
 
diff --git a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
index 79b44c8..db11978 100644
--- a/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
+++ b/ch02/02_bonus_bytepair-encoder/bpe_openai_gpt2.py
@@ -70,7 +70,7 @@ def get_pairs(word):
 
 
 class Encoder:
-    def __init__(self, encoder, bpe_merges, errors='replace'):
+    def __init__(self, encoder, bpe_merges, errors="replace"):
         self.encoder = encoder
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding
@@ -92,7 +92,7 @@ class Encoder:
             return token
 
         while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
             if bigram not in self.bpe_ranks:
                 break
             first, second = bigram
@@ -119,43 +119,43 @@ class Encoder:
                 break
             else:
                 pairs = get_pairs(word)
-        word = ' '.join(word)
+        word = " ".join(word)
         self.cache[token] = word
         return word
 
     def encode(self, text):
         bpe_tokens = []
         for token in re.findall(self.pat, text):
-            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
+            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
+            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" "))
         return bpe_tokens
 
     def decode(self, tokens):
-        text = ''.join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        text = "".join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
         return text
 
 
 def get_encoder(model_name, models_dir):
-    with open(os.path.join(models_dir, model_name, 'encoder.json'), 'r') as f:
+    with open(os.path.join(models_dir, model_name, "encoder.json"), "r") as f:
         encoder = json.load(f)
-    with open(os.path.join(models_dir, model_name, 'vocab.bpe'), 'r', encoding="utf-8") as f:
+    with open(os.path.join(models_dir, model_name, "vocab.bpe"), "r", encoding="utf-8") as f:
         bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]]
+    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
     return Encoder(encoder=encoder, bpe_merges=bpe_merges)
 
 
 def download_vocab():
     # Modified code from
-    subdir = 'gpt2_model'
+    subdir = "gpt2_model"
     if not os.path.exists(subdir):
         os.makedirs(subdir)
-    subdir = subdir.replace('\\', '/')  # needed for Windows
+    subdir = subdir.replace("\\", "/")  # needed for Windows
 
-    for filename in ['encoder.json', 'vocab.bpe']:
+    for filename in ["encoder.json", "vocab.bpe"]:
         r = requests.get("https://openaipublic.blob.core.windows.net/gpt-2/models/117M/" + filename, stream=True)
 
-        with open(os.path.join(subdir, filename), 'wb') as f:
+        with open(os.path.join(subdir, filename), "wb") as f:
             file_size = int(r.headers["content-length"])
             chunk_size = 1000
             with tqdm(ncols=100, desc="Fetching " + filename, total=file_size, unit_scale=True) as pbar:
diff --git a/ch04/01_main-chapter-code/previous_chapters.py b/ch04/01_main-chapter-code/previous_chapters.py
index 027f2a7..716e7f8 100644
--- a/ch04/01_main-chapter-code/previous_chapters.py
+++ b/ch04/01_main-chapter-code/previous_chapters.py
@@ -60,7 +60,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
diff --git a/ch04/01_main-chapter-code/tests.py b/ch04/01_main-chapter-code/tests.py
index 8baa23f..6448ef2 100644
--- a/ch04/01_main-chapter-code/tests.py
+++ b/ch04/01_main-chapter-code/tests.py
@@ -33,8 +33,8 @@ def test_main(capsys):
     captured = capsys.readouterr()
 
     # Normalize line endings and strip trailing whitespace from each line
-    normalized_expected = '\n'.join(line.rstrip() for line in expected.splitlines())
-    normalized_output = '\n'.join(line.rstrip() for line in captured.out.splitlines())
+    normalized_expected = "\n".join(line.rstrip() for line in expected.splitlines())
+    normalized_output = "\n".join(line.rstrip() for line in captured.out.splitlines())
 
     # Compare normalized strings
     assert normalized_output == normalized_expected
diff --git a/ch05/01_main-chapter-code/previous_chapters.py b/ch05/01_main-chapter-code/previous_chapters.py
index 369e370..c599d16 100644
--- a/ch05/01_main-chapter-code/previous_chapters.py
+++ b/ch05/01_main-chapter-code/previous_chapters.py
@@ -71,7 +71,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
index 4eafa3a..0d17d65 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/prepare_dataset.py
@@ -43,7 +43,7 @@ def combine_files(file_paths, target_dir, max_size_mb=500, separator="<|endoftex
         content = strip_headers(content)
 
         # Regular expression to replace multiple blank lines with a single blank line
-        content = re.sub(r'\n\s*\n', '\n\n', content)
+        content = re.sub(r"\n\s*\n", "\n\n", content)
 
         estimated_size = len(content.encode("utf-8"))
         if current_size + estimated_size > max_size_mb * 1024 * 1024:
diff --git a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py
index 92237a1..5bb7728 100644
--- a/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py
+++ b/ch05/03_bonus_pretraining_on_gutenberg/pretraining_simple.py
@@ -148,26 +148,26 @@ def train_model_simple(model, optimizer, device, n_epochs,
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='GPT Model Training Configuration')
+    parser = argparse.ArgumentParser(description="GPT Model Training Configuration")
 
-    parser.add_argument('--data_dir', type=str, default='gutenberg/data',
-                        help='Directory containing the training data')
-    parser.add_argument('--output_dir', type=str, default='model_checkpoints',
-                        help='Directory where the model checkpoints will be saved')
-    parser.add_argument('--n_epochs', type=int, default=1,
-                        help='Number of epochs to train the model')
-    parser.add_argument('--print_sample_iter', type=int, default=1000,
-                        help='Iterations between printing sample outputs')
-    parser.add_argument('--eval_freq', type=int, default=100,
-                        help='Frequency of evaluations during training')
-    parser.add_argument('--save_ckpt_freq', type=int, default=100_000,
-                        help='Frequency of saving model checkpoints during training')
-    parser.add_argument('--lr', type=float, default=5e-4,
-                        help='Learning rate for the optimizer')
-    parser.add_argument('--batch_size', type=int, default=4,
-                        help='Batch size for training')
-    parser.add_argument('--debug', type=bool, default=False,
-                        help='Uses a very small model for debugging purposes')
+    parser.add_argument("--data_dir", type=str, default="gutenberg/data",
+                        help="Directory containing the training data")
+    parser.add_argument("--output_dir", type=str, default="model_checkpoints",
+                        help="Directory where the model checkpoints will be saved")
+    parser.add_argument("--n_epochs", type=int, default=1,
+                        help="Number of epochs to train the model")
+    parser.add_argument("--print_sample_iter", type=int, default=1000,
+                        help="Iterations between printing sample outputs")
+    parser.add_argument("--eval_freq", type=int, default=100,
+                        help="Frequency of evaluations during training")
+    parser.add_argument("--save_ckpt_freq", type=int, default=100_000,
+                        help="Frequency of saving model checkpoints during training")
+    parser.add_argument("--lr", type=float, default=5e-4,
+                        help="Learning rate for the optimizer")
+    parser.add_argument("--batch_size", type=int, default=4,
+                        help="Batch size for training")
+    parser.add_argument("--debug", type=bool, default=False,
+                        help="Uses a very small model for debugging purposes")
 
     args = parser.parse_args()
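
One pre-existing quirk worth flagging while this file is touched: argparse's type=bool does not actually parse strings into booleans (bool("False") is True, so --debug False still enables debug mode). A possible follow-up, sketched here only as a suggestion and not part of this change, is the store_true pattern the other scripts in this diff already use:

    parser.add_argument("--debug", action="store_true",
                        help="Uses a very small model for debugging purposes")
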
diff --git a/ch05/05_bonus_hparam_tuning/hparam_search.py b/ch05/05_bonus_hparam_tuning/hparam_search.py
index e2e68e2..7e1a799 100644
--- a/ch05/05_bonus_hparam_tuning/hparam_search.py
+++ b/ch05/05_bonus_hparam_tuning/hparam_search.py
@@ -118,7 +118,7 @@ if __name__ == "__main__":
     print(f"Total hyperparameter configurations: {total_combinations}")
 
     # Placeholder for the best loss and best hyperparameters
-    best_val_loss = float('inf')
+    best_val_loss = float("inf")
     best_hparams = {}
 
     script_path = os.path.abspath(__file__)
diff --git a/ch05/07_gpt_to_llama/previous_chapters.py b/ch05/07_gpt_to_llama/previous_chapters.py
index 93411f5..787540b 100644
--- a/ch05/07_gpt_to_llama/previous_chapters.py
+++ b/ch05/07_gpt_to_llama/previous_chapters.py
@@ -38,7 +38,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
 
         # New: Apply temperature scaling
         if temperature > 0.0:
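
The hunk above (and its twins in the files below) is the top-k sampling filter: every logit smaller than the k-th largest is set to -inf so that softmax assigns it zero probability. A minimal standalone sketch of the same pattern, with made-up numbers for illustration only:

    import torch

    logits = torch.tensor([[2.0, 0.5, 1.0, -1.0]])
    top_logits, _ = torch.topk(logits, 2)  # tensor([[2.0, 1.0]])
    min_val = top_logits[:, -1]            # smallest kept logit: 1.0
    logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
    print(logits)  # tensor([[2., -inf, 1., -inf]])
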
diff --git a/ch05/08_memory_efficient_weight_loading/previous_chapters.py b/ch05/08_memory_efficient_weight_loading/previous_chapters.py
index 1fb5835..1657e96 100644
--- a/ch05/08_memory_efficient_weight_loading/previous_chapters.py
+++ b/ch05/08_memory_efficient_weight_loading/previous_chapters.py
@@ -29,7 +29,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
diff --git a/ch05/10_llm-training-speed/00_orig.py b/ch05/10_llm-training-speed/00_orig.py
index 1500e4e..b0f4e8d 100644
--- a/ch05/10_llm-training-speed/00_orig.py
+++ b/ch05/10_llm-training-speed/00_orig.py
@@ -426,7 +426,7 @@ def main(gpt_config, settings):
 
     if not os.path.exists(file_path):
         with urllib.request.urlopen(url) as response:
-            text_data = response.read().decode('utf-8')
+            text_data = response.read().decode("utf-8")
         with open(file_path, "w", encoding="utf-8") as file:
             file.write(text_data)
     else:
diff --git a/ch06/01_main-chapter-code/previous_chapters.py b/ch06/01_main-chapter-code/previous_chapters.py
index 9f3d8e8..ab6e746 100644
--- a/ch06/01_main-chapter-code/previous_chapters.py
+++ b/ch06/01_main-chapter-code/previous_chapters.py
@@ -72,7 +72,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -249,8 +249,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
@@ -310,7 +310,7 @@ def load_weights_into_gpt(gpt, params):
 
 
 def text_to_token_ids(text, tokenizer):
-    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
+    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
     encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
     return encoded_tensor
 
diff --git a/ch06/02_bonus_additional-experiments/additional_experiments.py b/ch06/02_bonus_additional-experiments/additional_experiments.py
index 87e660f..5d518cb 100644
--- a/ch06/02_bonus_additional-experiments/additional_experiments.py
+++ b/ch06/02_bonus_additional-experiments/additional_experiments.py
@@ -446,7 +446,7 @@
     )
     parser.add_argument(
         "--average_embeddings",
-        action='store_true',
+        action="store_true",
         default=False,
         help=(
             "Average the output embeddings from all tokens instead of using"
@@ -480,7 +480,7 @@
     )
     parser.add_argument(
         "--no_padding",
-        action='store_true',
+        action="store_true",
         default=False,
         help=(
             "Disable padding, which means each example may have a different length."
@@ -517,7 +517,7 @@
     )
     parser.add_argument(
         "--disable_causal_mask",
-        action='store_true',
+        action="store_true",
         default=False,
         help=(
             "Disables the causal attention mask."
diff --git a/ch06/02_bonus_additional-experiments/previous_chapters.py b/ch06/02_bonus_additional-experiments/previous_chapters.py
index a4a9baa..8848afd 100644
--- a/ch06/02_bonus_additional-experiments/previous_chapters.py
+++ b/ch06/02_bonus_additional-experiments/previous_chapters.py
@@ -74,7 +74,7 @@ class MultiHeadAttention(nn.Module):
         self.dropout = nn.Dropout(dropout)
 
         if not disable_causal_mask:
-            self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+            self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
         self.disable_causal_mask = disable_causal_mask
 
     def forward(self, x):
@@ -255,8 +255,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
@@ -328,7 +328,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
 
         # New: Apply temperature scaling
         if temperature > 0.0:
diff --git a/ch06/03_bonus_imdb-classification/previous_chapters.py b/ch06/03_bonus_imdb-classification/previous_chapters.py
index 2bd2035..bb01cc5 100644
--- a/ch06/03_bonus_imdb-classification/previous_chapters.py
+++ b/ch06/03_bonus_imdb-classification/previous_chapters.py
@@ -73,7 +73,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -250,8 +250,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
@@ -311,7 +311,7 @@ def load_weights_into_gpt(gpt, params):
 
 
 def text_to_token_ids(text, tokenizer):
-    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
+    encoded = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
     encoded_tensor = torch.tensor(encoded).unsqueeze(0)  # add batch dimension
     return encoded_tensor
 
diff --git a/ch06/03_bonus_imdb-classification/train_gpt.py b/ch06/03_bonus_imdb-classification/train_gpt.py
index b650634..4eaca10 100644
--- a/ch06/03_bonus_imdb-classification/train_gpt.py
+++ b/ch06/03_bonus_imdb-classification/train_gpt.py
@@ -261,7 +261,7 @@
     )
     parser.add_argument(
         "--average_embeddings",
-        action='store_true',
+        action="store_true",
         default=False,
         help=(
             "Average the output embeddings from all tokens instead of using"
diff --git a/ch07/01_main-chapter-code/previous_chapters.py b/ch07/01_main-chapter-code/previous_chapters.py
index 0aadf9e..2979679 100644
--- a/ch07/01_main-chapter-code/previous_chapters.py
+++ b/ch07/01_main-chapter-code/previous_chapters.py
@@ -77,7 +77,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -261,7 +261,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
 
         # New: Apply temperature scaling
         if temperature > 0.0:
@@ -356,8 +356,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
diff --git a/ch07/02_dataset-utilities/find-near-duplicates.py b/ch07/02_dataset-utilities/find-near-duplicates.py
index 45b2fce..6b62c26 100644
--- a/ch07/02_dataset-utilities/find-near-duplicates.py
+++ b/ch07/02_dataset-utilities/find-near-duplicates.py
@@ -34,7 +34,7 @@ def preprocess_text(text):
     # Lowercase the text
     text = text.lower()
     # Remove punctuation
-    text = re.sub(r'[^\w\s]', '', text)
+    text = re.sub(r"[^\w\s]", "", text)
     return text
 
 
@@ -50,7 +50,7 @@ def find_near_duplicates(json_data, threshold=0.75, key="instruction"):
         return {}, near_duplicates
 
     # Vectorize the text data
-    vectorizer = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
+    vectorizer = TfidfVectorizer(stop_words=None, analyzer="char", ngram_range=(1, 3))
     tfidf_matrix = vectorizer.fit_transform(text)
 
     # Compute cosine similarity between each pair of entries
@@ -84,7 +84,7 @@ def find_print_and_remove_near_duplicates(json_data, remove_duplicates=False, th
         json_data, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
     else:
         _, near_duplicates = find_near_duplicates(json_data, key=key, threshold=threshold)
-    separator = 50 * '='
+    separator = 50 * "="
     print(f"\n\n{separator}\nSearching '{key}' for duplicates ...\n{separator}")
     if not near_duplicates:
         print("No duplicates found")
@@ -114,7 +114,7 @@
     )
     parser.add_argument(
         "--remove_duplicates",
-        action='store_true',
+        action="store_true",
         default=False,
         help=(
             "Removes duplicates based on the 'input' or 'output' keys "
diff --git a/ch07/04_preference-tuning-with-dpo/previous_chapters.py b/ch07/04_preference-tuning-with-dpo/previous_chapters.py
index 829e92c..c3783ce 100644
--- a/ch07/04_preference-tuning-with-dpo/previous_chapters.py
+++ b/ch07/04_preference-tuning-with-dpo/previous_chapters.py
@@ -77,7 +77,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
@@ -261,7 +261,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
 
         # New: Apply temperature scaling
         if temperature > 0.0:
@@ -357,8 +357,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
diff --git a/pkg/llms_from_scratch/ch03.py b/pkg/llms_from_scratch/ch03.py
index e6ca561..f458d00 100644
--- a/pkg/llms_from_scratch/ch03.py
+++ b/pkg/llms_from_scratch/ch03.py
@@ -59,7 +59,7 @@ class CausalAttention(nn.Module):
         self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.dropout = nn.Dropout(dropout)  # New
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))  # New
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape  # New batch dimension b
@@ -109,7 +109,7 @@ class MultiHeadAttention(nn.Module):
         self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
         self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
         self.dropout = nn.Dropout(dropout)
-        self.register_buffer('mask', torch.triu(torch.ones(context_length, context_length), diagonal=1))
+        self.register_buffer("mask", torch.triu(torch.ones(context_length, context_length), diagonal=1))
 
     def forward(self, x):
         b, num_tokens, d_in = x.shape
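
All of the register_buffer("mask", ...) hunks in this diff store the same upper-triangular causal mask; only the quote style changes. For reference, a small standalone sketch (with a made-up context length, for illustration only) of what the buffer contains and how such a mask is typically applied to attention scores:

    import torch

    context_length = 4
    mask = torch.triu(torch.ones(context_length, context_length), diagonal=1)
    # mask[i, j] == 1 exactly where j > i, i.e. for "future" positions:
    # tensor([[0., 1., 1., 1.],
    #         [0., 0., 1., 1.],
    #         [0., 0., 0., 1.],
    #         [0., 0., 0., 0.]])
    attn_scores = torch.randn(context_length, context_length)
    attn_scores.masked_fill_(mask.bool(), float("-inf"))  # hide future tokens
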
diff --git a/pkg/llms_from_scratch/ch05.py b/pkg/llms_from_scratch/ch05.py
index afca998..b91baa7 100644
--- a/pkg/llms_from_scratch/ch05.py
+++ b/pkg/llms_from_scratch/ch05.py
@@ -30,7 +30,7 @@ def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=No
             # Keep only top_k values
             top_logits, _ = torch.topk(logits, top_k)
             min_val = top_logits[:, -1]
-            logits = torch.where(logits < min_val, torch.tensor(float('-inf')).to(logits.device), logits)
+            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)
 
         # New: Apply temperature scaling
         if temperature > 0.0:
@@ -125,8 +125,8 @@ def assign(left, right):
 
 
 def load_weights_into_gpt(gpt, params):
-    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
-    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
+    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params["wpe"])
+    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params["wte"])
 
     for b in range(len(params["blocks"])):
         q_w, k_w, v_w = np.split(
diff --git a/pkg/llms_from_scratch/tests/test_qwen3.py b/pkg/llms_from_scratch/tests/test_qwen3.py
index 02d9c31..68dd901 100644
--- a/pkg/llms_from_scratch/tests/test_qwen3.py
+++ b/pkg/llms_from_scratch/tests/test_qwen3.py
@@ -110,7 +110,7 @@ def test_dummy_qwen3_moe_forward(dummy_cfg_moe, dummy_input):
     out = model(dummy_input)
     assert out.shape == (1, dummy_input.size(1), dummy_cfg_moe["vocab_size"]), \
         f"Expected shape (1, seq_len, vocab_size), got {out.shape}"
-    assert any(hasattr(block.ff, 'gate') for block in model.trf_blocks), \
+    assert any(hasattr(block.ff, "gate") for block in model.trf_blocks), \
         "Expected MoEFeedForward in at least one transformer block"