From 47b3ff15ec03a55d153682084b7063100c9b9bdf Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Tue, 14 May 2024 20:35:50 -0400 Subject: [PATCH] improve bonus code in chapter 06 --- ch06/03_bonus_imdb-classification/README.md | 2 +- .../download-prepare-dataset.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/ch06/03_bonus_imdb-classification/README.md b/ch06/03_bonus_imdb-classification/README.md index ede1721..4da6bec 100644 --- a/ch06/03_bonus_imdb-classification/README.md +++ b/ch06/03_bonus_imdb-classification/README.md @@ -14,7 +14,7 @@ pip install -r requirements-extra.txt The codes are using the 50k movie reviews from IMDb ([dataset source](https://ai.stanford.edu/~amaas/data/sentiment/)) to predict whether a movie review is positive or negative. -Run the following code to create the `train.csv`, `val.csv`, and `test.csv` datasets: +Run the following code to create the `train.csv`, `validation.csv`, and `test.csv` datasets: ```bash python download-prepare-dataset.py diff --git a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py index e3e60b4..f5ab61c 100644 --- a/ch06/03_bonus_imdb-classification/download-prepare-dataset.py +++ b/ch06/03_bonus_imdb-classification/download-prepare-dataset.py @@ -19,7 +19,8 @@ def reporthook(count, block_size, total_size): duration = time.time() - start_time progress_size = int(count * block_size) percent = count * block_size * 100 / total_size - speed = progress_size / (1024**2 * duration) + + speed = int(progress_size / (1024 * duration)) if duration else 0 sys.stdout.write( f"\r{int(percent)}% | {progress_size / (1024**2):.2f} MB " f"| {speed:.2f} MB/s | {duration:.2f} sec elapsed" ) @@ -32,6 +33,7 @@ def download_and_extract_dataset(dataset_url, target_file, directory): if os.path.exists(target_file): os.remove(target_file) urllib.request.urlretrieve(dataset_url, target_file, reporthook) + print("\nExtracting dataset ...") with tarfile.open(target_file, "r:gz") as tar: tar.extractall() else: @@ -74,6 +76,9 @@ if __name__ == "__main__": dataset_url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" + print("Downloading dataset ...") download_and_extract_dataset(dataset_url, "aclImdb_v1.tar.gz", "aclImdb") + print("Creating data frames ...") df = load_dataset_to_dataframe() + print("Partitioning and saving data frames ...") partition_and_save(df)