autogen/notebook/flaml_finetune_transformer.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n",
"\n",
"**Requirements.** This notebook has additional requirements:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"#!pip install torch transformers datasets ipywidgets flaml[blendsearch,ray];"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoTokenizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"MODEL_CHECKPOINT = \"distilbert-base-uncased\""
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}"
]
},
"metadata": {},
"execution_count": 5
}
],
"source": [
"tokenizer(\"this is a test\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"TASK = \"cola\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import datasets"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
2021-02-06 16:24:38 -08:00
"text": [
"Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
]
}
],
"source": [
"raw_dataset = datasets.load_dataset(\"glue\", TASK)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# define tokenization function used to process data\n",
"COLUMN_NAME = \"sentence\"\n",
"def tokenize(examples):\n",
" return tokenizer(examples[COLUMN_NAME], truncation=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "ecc66e6795f848e0a41e6cf1ce37bdf2"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "2d33fc70b80b403080ad8c0e77ed1891"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))",
"application/vnd.jupyter.widget-view+json": {
"version_major": 2,
"version_minor": 0,
"model_id": "d2ab3feb1a354187abb2dded0ead404f"
}
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n"
]
}
],
"source": [
"encoded_dataset = raw_dataset.map(tokenize, batched=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" 'idx': 0,\n",
" 'input_ids': [101,\n",
" 2256,\n",
" 2814,\n",
" 2180,\n",
" 1005,\n",
" 1056,\n",
" 4965,\n",
" 2023,\n",
" 4106,\n",
" 1010,\n",
" 2292,\n",
" 2894,\n",
" 1996,\n",
" 2279,\n",
" 2028,\n",
" 2057,\n",
" 16599,\n",
" 1012,\n",
" 102],\n",
" 'label': 1,\n",
" 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}"
]
},
"metadata": {},
"execution_count": 11
}
],
"source": [
"encoded_dataset[\"train\"][0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForSequenceClassification"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"NUM_LABELS = 2\n",
"model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"DistilBertForSequenceClassification(\n",
" (distilbert): DistilBertModel(\n",
" (embeddings): Embeddings(\n",
" (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
" (position_embeddings): Embedding(512, 768)\n",
" (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (transformer): Transformer(\n",
" (layer): ModuleList(\n",
" (0): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (1): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (2): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (3): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (4): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" (5): TransformerBlock(\n",
" (attention): MultiHeadSelfAttention(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
" )\n",
" (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" (ffn): FFN(\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
" (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
" )\n",
" (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
" )\n",
" )\n",
" )\n",
" )\n",
" (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
" (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
" (dropout): Dropout(p=0.2, inplace=False)\n",
")"
]
},
"metadata": {},
"execution_count": 14
}
],
"source": [
"model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Metric"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"metric = datasets.load_metric(\"glue\", TASK)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n",
"Compute GLUE evaluation metric associated to each GLUE dataset.\n",
"Args:\n",
" predictions: list of predictions to score.\n",
" Each translation should be tokenized into a list of tokens.\n",
" references: list of lists of references for each translation.\n",
" Each reference should be tokenized into a list of tokens.\n",
"Returns: depending on the GLUE subset, one or several of:\n",
" \"accuracy\": Accuracy\n",
" \"f1\": F1 score\n",
" \"pearson\": Pearson Correlation\n",
" \"spearmanr\": Spearman Correlation\n",
" \"matthews_correlation\": Matthew Correlation\n",
"Examples:\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'sst2') # 'sst2' or any of [\"mnli\", \"mnli_mismatched\", \"mnli_matched\", \"qnli\", \"rte\", \"wnli\", \"hans\"]\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'accuracy': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'mrpc') # 'mrpc' or 'qqp'\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'accuracy': 1.0, 'f1': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'stsb')\n",
" >>> references = [0., 1., 2., 3., 4., 5.]\n",
" >>> predictions = [0., 1., 2., 3., 4., 5.]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print({\"pearson\": round(results[\"pearson\"], 2), \"spearmanr\": round(results[\"spearmanr\"], 2)})\n",
" {'pearson': 1.0, 'spearmanr': 1.0}\n",
"\n",
" >>> glue_metric = datasets.load_metric('glue', 'cola')\n",
" >>> references = [0, 1]\n",
" >>> predictions = [0, 1]\n",
" >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
" >>> print(results)\n",
" {'matthews_correlation': 1.0}\n",
"\"\"\", stored examples: 0)"
]
},
"metadata": {},
"execution_count": 16
}
],
"source": [
"metric"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"def compute_metrics(eval_pred):\n",
" predictions, labels = eval_pred\n",
" predictions = np.argmax(predictions, axis=1)\n",
" return metric.compute(predictions=predictions, references=labels)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training (aka Finetuning)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from transformers import Trainer\n",
"from transformers import TrainingArguments"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"args = TrainingArguments(\n",
" output_dir='output',\n",
" do_eval=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"trainer = Trainer(\n",
" model=model,\n",
" args=args,\n",
" train_dataset=encoded_dataset[\"train\"],\n",
" eval_dataset=encoded_dataset[\"validation\"],\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "\n <div>\n <style>\n /* Turns off some styling */\n progress {\n /* gets rid of default border in Firefox and Opera. */\n border: none;\n /* Needs to be in here for Safari polyfill so background images work as expected. */\n background-size: auto;\n }\n </style>\n \n <progress value='2' max='804' style='width:300px; height:20px; vertical-align: middle;'></progress>\n [ 2/804 : < :, Epoch 0.00/3]\n </div>\n <table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: left;\">\n <th>Step</th>\n <th>Training Loss</th>\n </tr>\n </thead>\n <tbody>\n </tbody>\n</table><p>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"/home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n warnings.warn('Was asked to gather along dimension 0, but all '\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"TrainOutput(global_step=804, training_loss=0.3209413462017306, metrics={'train_runtime': 115.5328, 'train_samples_per_second': 6.959, 'total_flos': 238363718990580.0, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 2336600064, 'init_mem_gpu_alloc_delta': 268953088, 'init_mem_cpu_peaked_delta': 257929216, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 2381066240, 'train_mem_gpu_alloc_delta': 806788096, 'train_mem_cpu_peaked_delta': 186974208, 'train_mem_gpu_peaked_delta': 550790144})"
]
},
"metadata": {},
"execution_count": 21
}
],
"source": [
"trainer.train()"
]
},
{
"source": [
"## Hyperparameter Optimization\n",
"\n",
"`flaml.tune` is a module for economical hyperparameter tuning. It frees users from manually tuning many hyperparameters for a software, such as machine learning training procedures. \n",
"The API is compatible with ray tune.\n",
"\n",
"### Step 1. Define training method\n",
"\n",
"We define a function `train_distilbert(config: dict)` that accepts a hyperparameter configuration dict `config`. The specific configs will be generated by flaml's search algorithm in a given search space.\n"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"import flaml\n",
"\n",
"def train_distilbert(config: dict):\n",
"\n",
" # Load CoLA dataset and apply tokenizer\n",
" cola_raw = datasets.load_dataset(\"glue\", TASK)\n",
" cola_encoded = cola_raw.map(tokenize, batched=True)\n",
" train_dataset, eval_dataset = cola_encoded[\"train\"], cola_encoded[\"validation\"]\n",
"\n",
" model = AutoModelForSequenceClassification.from_pretrained(\n",
" MODEL_CHECKPOINT, num_labels=NUM_LABELS\n",
" )\n",
"\n",
" metric = datasets.load_metric(\"glue\", TASK)\n",
" def compute_metrics(eval_pred):\n",
" predictions, labels = eval_pred\n",
" predictions = np.argmax(predictions, axis=1)\n",
" return metric.compute(predictions=predictions, references=labels)\n",
"\n",
" training_args = TrainingArguments(\n",
" output_dir='.',\n",
" do_eval=False,\n",
" disable_tqdm=True,\n",
" logging_steps=20000,\n",
" save_total_limit=0,\n",
" **config,\n",
" )\n",
"\n",
" trainer = Trainer(\n",
" model,\n",
" training_args,\n",
" train_dataset=train_dataset,\n",
" eval_dataset=eval_dataset,\n",
" tokenizer=tokenizer,\n",
" compute_metrics=compute_metrics,\n",
" )\n",
"\n",
" # train model\n",
" trainer.train()\n",
"\n",
" # evaluate model\n",
" eval_output = trainer.evaluate()\n",
"\n",
" # report the metric to optimize\n",
" flaml.tune.report(\n",
" loss=eval_output[\"eval_loss\"],\n",
" matthews_correlation=eval_output[\"eval_matthews_correlation\"],\n",
" )"
]
},
{
"source": [
"### Step 2. Define the search\n",
"\n",
"We are now ready to define our search. This includes:\n",
"\n",
"- The `search_space` for our hyperparameters\n",
"- The metric and the mode ('max' or 'min') for optimization\n",
"- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`)"
],
"cell_type": "markdown",
"metadata": {}
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"max_num_epoch = 64\n",
"search_space = {\n",
" # You can mix constants with search space objects.\n",
" \"num_train_epochs\": flaml.tune.loguniform(1, max_num_epoch),\n",
" \"learning_rate\": flaml.tune.loguniform(1e-6, 1e-4),\n",
" \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
" \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
" \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"# optimization objective\n",
"HP_METRIC, MODE = \"matthews_correlation\", \"max\"\n",
"\n",
"# resources\n",
"num_cpus = 4\n",
"num_gpus = 4\n",
"\n",
"# constraints\n",
"num_samples = -1 # number of trials, -1 means unlimited\n",
"time_budget_s = 3600 # time budget in seconds"
]
},
{
"source": [
"### Step 3. Launch with `flaml.tune.run`\n",
"\n",
"We are now ready to launch the tuning using `flaml.tune.run`:"
],
"cell_type": "markdown",
"metadata": {}
},
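{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before the executed launch cell below, here is a minimal sketch of what such a call can look like. It is for illustration only: the argument names used here (e.g. `config`, `resources_per_trial`, `use_ray`) are assumptions based on the Ray Tune-compatible `flaml.tune` API and may differ slightly from the cell that follows.\n",
"\n",
"```python\n",
"import ray\n",
"\n",
"# Start a local Ray instance with the resources reserved above (assumption: single node).\n",
"ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
"\n",
"analysis = flaml.tune.run(\n",
"    train_distilbert,                 # training function defined in Step 1\n",
"    config=search_space,              # search space defined in Step 2\n",
"    metric=HP_METRIC,\n",
"    mode=MODE,\n",
"    resources_per_trial={\"cpu\": num_cpus, \"gpu\": num_gpus},\n",
"    num_samples=num_samples,\n",
"    time_budget_s=time_budget_s,\n",
"    use_ray=True,\n",
")\n",
"```"
]
},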
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"2021-05-07 02:35:57,130\tINFO services.py:1172 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n",
"2021-05-07 02:35:58,044\tWARNING function_runner.py:540 -- Function checkpointing is disabled. This may result in unexpected behavior when using checkpointing features or certain schedulers. To enable, set the train function arguments to be `func(config, checkpoint_dir=None)`.\n",
"Tuning started...\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 26.0/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 1/infinite (1 RUNNING)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 62.07ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.87ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 107.60ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 105.70ba/s]\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m {'train_runtime': 45.5778, 'train_samples_per_second': 5.88, 'epoch': 1.0}\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m {'eval_loss': 0.5879864692687988, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.7063, 'eval_samples_per_second': 611.265, 'epoch': 1.0}\n",
"Trial train_distilbert_a0c303d0 reported loss=0.5879864692687988,matthews_correlation=0.0 with parameters={'num_train_epochs': 1, 'learning_rate': 5.61151641533451e-06, 'adam_epsilon': 7.969454818643929e-08, 'adam_beta1': 0.9390788489441669, 'adam_beta2': 0.99186521389353}.\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars\n",
"\u001b[2m\u001b[36m(pid=886303)\u001b[0m mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 30.9/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 2/infinite (1 PENDING, 1 RUNNING)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_a0c303d0 completed. Last result: loss=0.5879864692687988,matthews_correlation=0.0\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 61.83ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 41.19ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 108.40ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 104.85ba/s]\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m {'train_runtime': 62.1006, 'train_samples_per_second': 6.248, 'epoch': 1.45}\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m {'eval_loss': 0.6030182838439941, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.7026, 'eval_samples_per_second': 612.584, 'epoch': 1.45}\n",
"Trial train_distilbert_a0c303d1 reported loss=0.6030182838439941,matthews_correlation=0.0 with parameters={'num_train_epochs': 1.444265389543504, 'learning_rate': 2.051338263087453e-06, 'adam_epsilon': 2.0511104188434023e-09, 'adam_beta1': 0.8110358863119579, 'adam_beta2': 0.997213662958137}.\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars\n",
"\u001b[2m\u001b[36m(pid=886302)\u001b[0m mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 3/infinite (1 PENDING, 1 RUNNING, 1 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_a0c303d1 completed. Last result: loss=0.6030182838439941,matthews_correlation=0.0\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 56.45ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 39.00ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 112.51ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.76ba/s]\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m {'train_runtime': 44.0366, 'train_samples_per_second': 6.086, 'epoch': 1.0}\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m {'eval_loss': 0.5865175724029541, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.6974, 'eval_samples_per_second': 614.462, 'epoch': 1.0}\n",
"Trial train_distilbert_c39b2ef0 reported loss=0.5865175724029541,matthews_correlation=0.0 with parameters={'num_train_epochs': 1.0, 'learning_rate': 5.265428651017862e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9093950363089345, 'adam_beta2': 0.9937145453421068}.\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars\n",
"\u001b[2m\u001b[36m(pid=886305)\u001b[0m mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.4/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 4/infinite (1 PENDING, 1 RUNNING, 2 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_c39b2ef0 completed. Last result: loss=0.5865175724029541,matthews_correlation=0.0\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 59.27ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.35ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 114.16ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 92.98ba/s]\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m {'train_runtime': 47.2831, 'train_samples_per_second': 5.837, 'epoch': 1.03}\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m {'eval_loss': 0.5813134908676147, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.7102, 'eval_samples_per_second': 609.872, 'epoch': 1.03}\n",
"Trial train_distilbert_f00776e2 reported loss=0.5813134908676147,matthews_correlation=0.0 with parameters={'num_train_epochs': 1.027230096840913, 'learning_rate': 5.980351945986672e-06, 'adam_epsilon': 4.3894312769297216e-08, 'adam_beta1': 0.9687626615793994, 'adam_beta2': 0.9900193241041526}.\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars\n",
"\u001b[2m\u001b[36m(pid=886304)\u001b[0m mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.7/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 5/infinite (1 PENDING, 1 RUNNING, 3 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_f00776e2 completed. Last result: loss=0.5813134908676147,matthews_correlation=0.0\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 58.13ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 39.40ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 92.35ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.15ba/s]\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m {'train_runtime': 44.3622, 'train_samples_per_second': 6.041, 'epoch': 1.0}\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m {'eval_loss': 0.5855756998062134, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.6326, 'eval_samples_per_second': 638.84, 'epoch': 1.0}\n",
"Trial train_distilbert_11ab3900 reported loss=0.5855756998062134,matthews_correlation=0.0 with parameters={'num_train_epochs': 1.0, 'learning_rate': 5.066078755222997e-06, 'adam_epsilon': 7.092964136440028e-08, 'adam_beta1': 0.9621979585561743, 'adam_beta2': 0.9918380608681451}.\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/sklearn/metrics/_classification.py:873: RuntimeWarning: invalid value encountered in double_scalars\n",
"\u001b[2m\u001b[36m(pid=892770)\u001b[0m mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 32.0/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 6/infinite (1 PENDING, 1 RUNNING, 4 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_11ab3900 completed. Last result: loss=0.5855756998062134,matthews_correlation=0.0\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 55.93ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.18ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 104.47ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 102.67ba/s]\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m {'train_runtime': 88.0772, 'train_samples_per_second': 6.562, 'epoch': 2.16}\n",
"\u001b[2m\u001b[36m(pid=897725)\u001b[0m {'eval_loss': 0.5316324830055237, 'eval_matthews_correlation': 0.38889272875750597, 'eval_runtime': 1.6116, 'eval_samples_per_second': 647.165, 'epoch': 2.16}\n",
"Trial train_distilbert_353025b6 reported loss=0.5316324830055237,matthews_correlation=0.38889272875750597 with parameters={'num_train_epochs': 2.1544304289135847, 'learning_rate': 6.215678437115527e-06, 'adam_epsilon': 8.954255073716448e-08, 'adam_beta1': 0.9159597393321596, 'adam_beta2': 0.9918923676622686}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 30.9/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 7/infinite (1 PENDING, 1 RUNNING, 5 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_353025b6 completed. Last result: loss=0.5316324830055237,matthews_correlation=0.38889272875750597\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 60.41ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.27ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 107.10ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 93.66ba/s]\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m {'train_runtime': 45.7159, 'train_samples_per_second': 5.862, 'epoch': 1.0}\n",
"\u001b[2m\u001b[36m(pid=907288)\u001b[0m {'eval_loss': 0.5385054349899292, 'eval_matthews_correlation': 0.2805581766595423, 'eval_runtime': 1.6966, 'eval_samples_per_second': 614.762, 'epoch': 1.0}\n",
"Trial train_distilbert_5728a1de reported loss=0.5385054349899292,matthews_correlation=0.2805581766595423 with parameters={'num_train_epochs': 1.0, 'learning_rate': 1.0090242363457245e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9269328215628503, 'adam_beta2': 0.9931456651827125}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.3/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 8/infinite (1 PENDING, 1 RUNNING, 6 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_5728a1de completed. Last result: loss=0.5385054349899292,matthews_correlation=0.2805581766595423\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 60.71ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.09ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 96.21ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 89.91ba/s]\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m {'train_runtime': 111.6359, 'train_samples_per_second': 6.557, 'epoch': 2.73}\n",
"\u001b[2m\u001b[36m(pid=908756)\u001b[0m {'eval_loss': 0.5391769409179688, 'eval_matthews_correlation': 0.3272948213494272, 'eval_runtime': 1.7214, 'eval_samples_per_second': 605.887, 'epoch': 2.73}\n",
"Trial train_distilbert_9394c2e2 reported loss=0.5391769409179688,matthews_correlation=0.3272948213494272 with parameters={'num_train_epochs': 2.729346084540195, 'learning_rate': 3.456743686220407e-06, 'adam_epsilon': 4.499745343526232e-08, 'adam_beta1': 0.9281057667134762, 'adam_beta2': 0.9906135322351715}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.6/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 9/infinite (1 PENDING, 1 RUNNING, 7 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_9394c2e2 completed. Last result: loss=0.5391769409179688,matthews_correlation=0.3272948213494272\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 67.17ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 43.92ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 92.79ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 113.54ba/s]\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m {'train_runtime': 66.0391, 'train_samples_per_second': 6.481, 'epoch': 1.6}\n",
"\u001b[2m\u001b[36m(pid=912284)\u001b[0m {'eval_loss': 0.5275164842605591, 'eval_matthews_correlation': 0.37917684067701946, 'eval_runtime': 1.8839, 'eval_samples_per_second': 553.629, 'epoch': 1.6}\n",
"Trial train_distilbert_b6543fec reported loss=0.5275164842605591,matthews_correlation=0.37917684067701946 with parameters={'num_train_epochs': 1.5953752206236405, 'learning_rate': 7.011758405605033e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.8768956788597737, 'adam_beta2': 0.9920981141573957}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.9/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 10/infinite (1 PENDING, 1 RUNNING, 8 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_b6543fec completed. Last result: loss=0.5275164842605591,matthews_correlation=0.37917684067701946\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 59.49ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 39.12ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.86ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 110.39ba/s]\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m {'train_runtime': 116.5658, 'train_samples_per_second': 6.692, 'epoch': 2.91}\n",
"\u001b[2m\u001b[36m(pid=914582)\u001b[0m {'eval_loss': 0.5162246823310852, 'eval_matthews_correlation': 0.417156672319181, 'eval_runtime': 1.6762, 'eval_samples_per_second': 622.252, 'epoch': 2.91}\n",
"Trial train_distilbert_0071f998 reported loss=0.5162246823310852,matthews_correlation=0.417156672319181 with parameters={'num_train_epochs': 2.9093911031251687, 'learning_rate': 5.509981405340389e-06, 'adam_epsilon': 7.397757073991268e-08, 'adam_beta1': 0.9550237998045454, 'adam_beta2': 0.9916866638359256}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.0/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 11/infinite (1 PENDING, 1 RUNNING, 9 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_0071f998 completed. Last result: loss=0.5162246823310852,matthews_correlation=0.417156672319181\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 53.62ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 35.94ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 104.02ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 107.63ba/s]\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m {'train_runtime': 64.0869, 'train_samples_per_second': 6.413, 'epoch': 1.53}\n",
"\u001b[2m\u001b[36m(pid=918301)\u001b[0m {'eval_loss': 0.5516289472579956, 'eval_matthews_correlation': 0.06558874629318973, 'eval_runtime': 1.7231, 'eval_samples_per_second': 605.3, 'epoch': 1.53}\n",
"Trial train_distilbert_2f830be6 reported loss=0.5516289472579956,matthews_correlation=0.06558874629318973 with parameters={'num_train_epochs': 1.533382973758465, 'learning_rate': 4.376455192665657e-06, 'adam_epsilon': 7.612697083213253e-08, 'adam_beta1': 0.8869305298731159, 'adam_beta2': 0.9896280392288217}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 12/infinite (1 PENDING, 1 RUNNING, 10 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_2f830be6 completed. Last result: loss=0.5516289472579956,matthews_correlation=0.06558874629318973\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 59.90ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 39.68ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 84.51ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 86.07ba/s]\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m {'train_runtime': 164.3921, 'train_samples_per_second': 6.667, 'epoch': 4.09}\n",
"\u001b[2m\u001b[36m(pid=920414)\u001b[0m {'eval_loss': 0.523731529712677, 'eval_matthews_correlation': 0.45354879777314566, 'eval_runtime': 1.6345, 'eval_samples_per_second': 638.117, 'epoch': 4.09}\n",
"Trial train_distilbert_7ce03f12 reported loss=0.523731529712677,matthews_correlation=0.45354879777314566 with parameters={'num_train_epochs': 4.087746394379008, 'learning_rate': 7.82557368974717e-06, 'adam_epsilon': 8.701436966404051e-08, 'adam_beta1': 0.9840530092635891, 'adam_beta2': 0.9939557025262034}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.7/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 13/infinite (1 PENDING, 1 RUNNING, 11 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_7ce03f12 completed. Last result: loss=0.523731529712677,matthews_correlation=0.45354879777314566\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 63.59ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 41.23ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 102.78ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 107.25ba/s]\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m {'train_runtime': 135.9594, 'train_samples_per_second': 6.708, 'epoch': 3.4}\n",
"\u001b[2m\u001b[36m(pid=925520)\u001b[0m {'eval_loss': 0.5112878680229187, 'eval_matthews_correlation': 0.4508496945113286, 'eval_runtime': 1.553, 'eval_samples_per_second': 671.602, 'epoch': 3.4}\n",
"Trial train_distilbert_aaab0508 reported loss=0.5112878680229187,matthews_correlation=0.4508496945113286 with parameters={'num_train_epochs': 3.402431917111274, 'learning_rate': 8.91979274640535e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9407072906865396, 'adam_beta2': 0.993946172640627}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 32.3/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 14/infinite (1 PENDING, 1 RUNNING, 12 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_aaab0508 completed. Last result: loss=0.5112878680229187,matthews_correlation=0.4508496945113286\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 57.22ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 38.97ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 90.88ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 89.31ba/s]\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m {'train_runtime': 142.1621, 'train_samples_per_second': 6.591, 'epoch': 3.5}\n",
"\u001b[2m\u001b[36m(pid=929827)\u001b[0m {'eval_loss': 0.5350601673126221, 'eval_matthews_correlation': 0.40085080763525827, 'eval_runtime': 1.7316, 'eval_samples_per_second': 602.346, 'epoch': 3.5}\n",
"Trial train_distilbert_14262454 reported loss=0.5350601673126221,matthews_correlation=0.40085080763525827 with parameters={'num_train_epochs': 3.495397786456084, 'learning_rate': 4.834054640339551e-06, 'adam_epsilon': 4.600931186189709e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9916961720574915}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 15/infinite (1 PENDING, 1 RUNNING, 13 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_14262454 completed. Last result: loss=0.5350601673126221,matthews_correlation=0.40085080763525827\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 53.04ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 37.06ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.60ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 90.49ba/s]\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m {'train_runtime': 261.0489, 'train_samples_per_second': 6.83, 'epoch': 6.65}\n",
"\u001b[2m\u001b[36m(pid=934238)\u001b[0m {'eval_loss': 0.609851062297821, 'eval_matthews_correlation': 0.5268023551875569, 'eval_runtime': 1.7076, 'eval_samples_per_second': 610.801, 'epoch': 6.65}\n",
"Trial train_distilbert_6d211fe6 reported loss=0.609851062297821,matthews_correlation=0.5268023551875569 with parameters={'num_train_epochs': 6.649954795358705, 'learning_rate': 1.173331172454689e-05, 'adam_epsilon': 5.4079124274855485e-08, 'adam_beta1': 0.9592773086704482, 'adam_beta2': 0.9945564008561629}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.8/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 16/infinite (1 PENDING, 1 RUNNING, 14 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_6d211fe6 completed. Last result: loss=0.609851062297821,matthews_correlation=0.5268023551875569\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 62.15ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.78ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 108.16ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 107.36ba/s]\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m {'train_runtime': 101.88, 'train_samples_per_second': 6.616, 'epoch': 2.51}\n",
"\u001b[2m\u001b[36m(pid=942628)\u001b[0m {'eval_loss': 0.5422758460044861, 'eval_matthews_correlation': 0.32496815807366203, 'eval_runtime': 1.7034, 'eval_samples_per_second': 612.32, 'epoch': 2.51}\n",
"Trial train_distilbert_c980bae4 reported loss=0.5422758460044861,matthews_correlation=0.32496815807366203 with parameters={'num_train_epochs': 2.512749499653892, 'learning_rate': 5.2192940076368766e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.99, 'adam_beta2': 0.9933553670097386}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.1/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 17/infinite (1 PENDING, 1 RUNNING, 15 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_c980bae4 completed. Last result: loss=0.5422758460044861,matthews_correlation=0.32496815807366203\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 57.84ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 40.01ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 102.16ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 81.15ba/s]\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m {'train_runtime': 517.8412, 'train_samples_per_second': 7.087, 'epoch': 13.69}\n",
"\u001b[2m\u001b[36m(pid=945904)\u001b[0m {'eval_loss': 0.9238015413284302, 'eval_matthews_correlation': 0.5494735380761103, 'eval_runtime': 1.588, 'eval_samples_per_second': 656.816, 'epoch': 13.69}\n",
"Trial train_distilbert_6d0d29d6 reported loss=0.9238015413284302,matthews_correlation=0.5494735380761103 with parameters={'num_train_epochs': 13.693961965290004, 'learning_rate': 1.1554924963127978e-05, 'adam_epsilon': 9.975195386335823e-08, 'adam_beta1': 0.9657731407927772, 'adam_beta2': 0.9951819129873288}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 32.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 18/infinite (1 PENDING, 1 RUNNING, 16 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_6d0d29d6 completed. Last result: loss=0.9238015413284302,matthews_correlation=0.5494735380761103\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 78%|███████▊ | 7/9 [00:00<00:00, 66.59ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 44.15ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 125.62ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 119.07ba/s]\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m {'train_runtime': 129.7296, 'train_samples_per_second': 6.675, 'epoch': 3.23}\n",
"\u001b[2m\u001b[36m(pid=973869)\u001b[0m {'eval_loss': 0.5334658622741699, 'eval_matthews_correlation': 0.4513069078434825, 'eval_runtime': 1.7406, 'eval_samples_per_second': 599.205, 'epoch': 3.23}\n",
"Trial train_distilbert_b16ea82a reported loss=0.5334658622741699,matthews_correlation=0.4513069078434825 with parameters={'num_train_epochs': 3.229299080310228, 'learning_rate': 1.1914452449037918e-05, 'adam_epsilon': 2.9318239583972084e-08, 'adam_beta1': 0.9527814765481193, 'adam_beta2': 0.9939312818847007}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 19/infinite (1 PENDING, 1 RUNNING, 17 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_b16ea82a completed. Last result: loss=0.5334658622741699,matthews_correlation=0.4513069078434825\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 55.88ba/s]\n",
" 89%|████████▉ | 8/9 [00:00<00:00, 33.97ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 39.36ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 94.15ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 105.63ba/s]\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m {'train_runtime': 604.2581, 'train_samples_per_second': 6.911, 'epoch': 15.58}\n",
"\u001b[2m\u001b[36m(pid=978003)\u001b[0m {'eval_loss': 0.9832845330238342, 'eval_matthews_correlation': 0.5699304939602442, 'eval_runtime': 1.7051, 'eval_samples_per_second': 611.691, 'epoch': 15.58}\n",
"Trial train_distilbert_eddf7cc0 reported loss=0.9832845330238342,matthews_correlation=0.5699304939602442 with parameters={'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.2/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 20/infinite (1 PENDING, 1 RUNNING, 18 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_eddf7cc0 completed. Last result: loss=0.9832845330238342,matthews_correlation=0.5699304939602442\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 53.75ba/s]\n",
" 89%|████████▉ | 8/9 [00:00<00:00, 32.34ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 37.56ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.80ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.92ba/s]\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m {'train_runtime': 464.4178, 'train_samples_per_second': 6.946, 'epoch': 12.04}\n",
"\u001b[2m\u001b[36m(pid=1000417)\u001b[0m {'eval_loss': 0.8574612736701965, 'eval_matthews_correlation': 0.5200220944545176, 'eval_runtime': 1.6294, 'eval_samples_per_second': 640.118, 'epoch': 12.04}\n",
"Trial train_distilbert_43008974 reported loss=0.8574612736701965,matthews_correlation=0.5200220944545176 with parameters={'num_train_epochs': 12.035709859477459, 'learning_rate': 1.0389153769735843e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.929089139333885, 'adam_beta2': 0.9932581096367817}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 31.4/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 21/infinite (1 PENDING, 1 RUNNING, 19 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"Trial train_distilbert_43008974 completed. Last result: loss=0.8574612736701965,matthews_correlation=0.5200220944545176\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
" 0%| | 0/9 [00:00<?, ?ba/s]\n",
" 67%|██████▋ | 6/9 [00:00<00:00, 57.01ba/s]\n",
"100%|██████████| 9/9 [00:00<00:00, 38.68ba/s]\n",
" 0%| | 0/2 [00:00<?, ?ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 94.45ba/s]\n",
"100%|██████████| 2/2 [00:00<00:00, 106.71ba/s]\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m /home/chiw/.local/lib/python3.8/site-packages/torch/nn/parallel/_functions.py:65: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m warnings.warn('Was asked to gather along dimension 0, but all '\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m {'train_runtime': 464.3935, 'train_samples_per_second': 6.878, 'epoch': 11.92}\n",
"2021-05-07 03:42:29,943\tINFO stopper.py:347 -- Reached timeout of 3600 seconds. Stopping all trials.\n",
"\u001b[2m\u001b[36m(pid=1022436)\u001b[0m {'eval_loss': 0.8282045722007751, 'eval_matthews_correlation': 0.5261643004428046, 'eval_runtime': 1.6945, 'eval_samples_per_second': 615.509, 'epoch': 11.92}\n",
"Trial train_distilbert_b3408a4e reported loss=0.8282045722007751,matthews_correlation=0.5261643004428046 with parameters={'num_train_epochs': 11.916545357570307, 'learning_rate': 1.104181373740261e-05, 'adam_epsilon': 4.674408006864194e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9938089843451122}.\n"
]
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 32.0/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 22/infinite (22 TERMINATED)<br><br>"
},
"metadata": {}
},
{
"output_type": "display_data",
"data": {
"text/plain": "<IPython.core.display.HTML object>",
"text/html": "== Status ==<br>Memory usage on this node: 32.0/251.6 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/150.39 GiB heap, 0.0/47.22 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /home/chiw/FLAML/notebook/logs/train_distilbert_2021-05-07_02-35-58<br>Number of trials: 22/infinite (22 TERMINATED)<br><table>\n<thead>\n<tr><th>Trial name </th><th>status </th><th>loc </th><th style=\"text-align: right;\"> adam_beta1</th><th style=\"text-align: right;\"> adam_beta2</th><th style=\"text-align: right;\"> adam_epsilon</th><th style=\"text-align: right;\"> learning_rate</th><th style=\"text-align: right;\"> num_train_epochs</th><th style=\"text-align: right;\"> iter</th><th style=\"text-align: right;\"> total time (s)</th><th style=\"text-align: right;\"> loss</th><th style=\"text-align: right;\"> matthews_correlation</th></tr>\n</thead>\n<tbody>\n<tr><td>train_distilbert_a0c303d0</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.939079</td><td style=\"text-align: right;\"> 0.991865</td><td style=\"text-align: right;\"> 7.96945e-08</td><td style=\"text-align: right;\"> 5.61152e-06</td><td style=\"text-align: right;\"> 1 </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 55.6909</td><td style=\"text-align: right;\">0.587986</td><td style=\"text-align: right;\"> 0 </td></tr>\n<tr><td>train_distilbert_a0c303d1</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.811036</td><td style=\"text-align: right;\"> 0.997214</td><td style=\"text-align: right;\"> 2.05111e-09</td><td style=\"text-align: right;\"> 2.05134e-06</td><td style=\"text-align: right;\"> 1.44427</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 71.7663</td><td style=\"text-align: right;\">0.603018</td><td style=\"text-align: right;\"> 0 </td></tr>\n<tr><td>train_distilbert_c39b2ef0</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.909395</td><td style=\"text-align: right;\"> 0.993715</td><td style=\"text-align: right;\"> 1e-07 </td><td style=\"text-align: right;\"> 5.26543e-06</td><td style=\"text-align: right;\"> 1 </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 53.7619</td><td style=\"text-align: right;\">0.586518</td><td style=\"text-align: right;\"> 0 </td></tr>\n<tr><td>train_distilbert_f00776e2</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.968763</td><td style=\"text-align: right;\"> 0.990019</td><td style=\"text-align: right;\"> 4.38943e-08</td><td style=\"text-align: right;\"> 5.98035e-06</td><td style=\"text-align: right;\"> 1.02723</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 56.8382</td><td style=\"text-align: right;\">0.581313</td><td style=\"text-align: right;\"> 0 </td></tr>\n<tr><td>train_distilbert_11ab3900</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.962198</td><td style=\"text-align: right;\"> 0.991838</td><td style=\"text-align: right;\"> 7.09296e-08</td><td style=\"text-align: right;\"> 5.06608e-06</td><td style=\"text-align: right;\"> 1 </td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 54.0231</td><td style=\"text-align: right;\">0.585576</td><td style=\"text-align: right;\"> 0 </td></tr>\n<tr><td>train_distilbert_353025b6</td><td>TERMINATED</td><td> </td><td style=\"text-align: right;\"> 0.91596 </td><td style=\"text-align: right;\"> 0.991892</td><td style=\"text-align: right;\"> 8.95426e-08</td><td 
style=\"text-align: right;\"> 6.21568e-06</td><td style=\"text-align: right;\"> 2.15443</td><td style=\"text-align: right;\"> 1</td><td style=\"text-align: right;\"> 98.3233</td><
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stderr",
"text": [
"2021-05-07 03:42:30,035\tINFO tune.py:450 -- Total run time: 3992.00 seconds (3991.90 seconds for the tuning loop).\n"
]
}
],
"source": [
"import time\n",
"import ray\n",
"start_time = time.time()\n",
"ray.shutdown()\n",
"ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
"\n",
"print(\"Tuning started...\")\n",
"analysis = flaml.tune.run(\n",
" train_distilbert,\n",
" search_alg=flaml.CFO(\n",
" space=search_space,\n",
" metric=HP_METRIC,\n",
" mode=MODE,\n",
" low_cost_partial_config={\"num_train_epochs\": 1}),\n",
" report_intermediate_result=False,\n",
" # uncomment the following if report_intermediate_result = True\n",
" # max_resource=max_num_epoch, min_resource=1,\n",
" resources_per_trial={\"gpu\": num_gpus, \"cpu\": num_cpus},\n",
" local_dir='logs/',\n",
" num_samples=num_samples,\n",
" time_budget_s=time_budget_s,\n",
" use_ray=True,\n",
")\n",
"\n",
"ray.shutdown()"
]
},
{
"cell_type": "code",
"execution_count": 26,
2021-02-06 16:24:38 -08:00
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"n_trials=22\ntime=3999.769361972809\nBest model eval matthews_correlation: 0.5699\nBest model parameters: {'num_train_epochs': 15.580684188655825, 'learning_rate': 1.2851507818900338e-05, 'adam_epsilon': 8.134982521948352e-08, 'adam_beta1': 0.99, 'adam_beta2': 0.9971094424784387}\n"
]
}
],
"source": [
"best_trial = analysis.get_best_trial(HP_METRIC, MODE, \"all\")\n",
"metric = best_trial.metric_analysis[HP_METRIC][MODE]\n",
"print(f\"n_trials={len(analysis.trials)}\")\n",
"print(f\"time={time.time()-start_time}\")\n",
"print(f\"Best model eval {HP_METRIC}: {metric:.4f}\")\n",
"print(f\"Best model parameters: {best_trial.config}\")\n"
]
},
{
"source": [
"## Next Steps\n",
"\n",
"Notice that we only reported the metric with `flaml.tune.report` at the end of full training loop. It is possible to enable reporting of intermediate performance - allowing early stopping - as follows:\n",
"\n",
"- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n",
"- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly"
],
"cell_type": "markdown",
"metadata": {}
}
],
"metadata": {
"kernelspec": {
"name": "python385jvsc74a57bd031f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6",
"display_name": "Python 3.8.5 64-bit"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
},
"metadata": {
"interpreter": {
"hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}