autogen/notebook/flaml_finetune_transformer.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This notebook uses flaml to finetune a transformer model from Huggingface transformers library.\n",
    "\n",
    "**Requirements.** This notebook has additional requirements:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch transformers datasets ipywidgets"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_CHECKPOINT = \"distilbert-base-uncased\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'input_ids': [101, 2023, 2003, 1037, 3231, 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}"
      ]
     },
     "metadata": {},
     "execution_count": 106
    }
   ],
   "source": [
    "tokenizer(\"this is a test\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "TASK = \"cola\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n"
     ]
    }
   ],
   "source": [
    "raw_dataset = datasets.load_dataset(\"glue\", TASK)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [],
   "source": [
    "# define tokenization function used to process data\n",
    "COLUMN_NAME = \"sentence\"\n",
    "def tokenize(examples):\n",
    "    return tokenizer(examples[COLUMN_NAME], truncation=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c3dd50f05994d4a5.arrow\n",
      "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-f2290a23c3c6f190.arrow\n",
      "Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-6868a7b57fb52895.arrow\n"
     ]
    }
   ],
   "source": [
    "encoded_dataset = raw_dataset.map(tokenize, batched=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
       " 'idx': 0,\n",
       " 'input_ids': [101,\n",
       "  2256,\n",
       "  2814,\n",
       "  2180,\n",
       "  1005,\n",
       "  1056,\n",
       "  4965,\n",
       "  2023,\n",
       "  4106,\n",
       "  1010,\n",
       "  2292,\n",
       "  2894,\n",
       "  1996,\n",
       "  2279,\n",
       "  2028,\n",
       "  2057,\n",
       "  16599,\n",
       "  1012,\n",
       "  102],\n",
       " 'label': 1,\n",
       " 'sentence': \"Our friends won't buy this analysis, let alone the next one we propose.\"}"
      ]
     },
     "metadata": {},
     "execution_count": 112
    }
   ],
   "source": [
    "encoded_dataset[\"train\"][0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoModelForSequenceClassification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
     ]
    }
   ],
   "source": [
    "NUM_LABELS = 2\n",
    "model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=NUM_LABELS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "DistilBertForSequenceClassification(\n",
       "  (distilbert): DistilBertModel(\n",
       "    (embeddings): Embeddings(\n",
       "      (word_embeddings): Embedding(30522, 768, padding_idx=0)\n",
       "      (position_embeddings): Embedding(512, 768)\n",
       "      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "      (dropout): Dropout(p=0.1, inplace=False)\n",
       "    )\n",
       "    (transformer): Transformer(\n",
       "      (layer): ModuleList(\n",
       "        (0): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (1): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (2): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (3): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (4): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "        (5): TransformerBlock(\n",
       "          (attention): MultiHeadSelfAttention(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (q_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (k_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (v_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "            (out_lin): Linear(in_features=768, out_features=768, bias=True)\n",
       "          )\n",
       "          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "          (ffn): FFN(\n",
       "            (dropout): Dropout(p=0.1, inplace=False)\n",
       "            (lin1): Linear(in_features=768, out_features=3072, bias=True)\n",
       "            (lin2): Linear(in_features=3072, out_features=768, bias=True)\n",
       "          )\n",
       "          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "    )\n",
       "  )\n",
       "  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)\n",
       "  (classifier): Linear(in_features=768, out_features=2, bias=True)\n",
       "  (dropout): Dropout(p=0.2, inplace=False)\n",
       ")"
      ]
     },
     "metadata": {},
     "execution_count": 115
    }
   ],
   "source": [
    "model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "metric = datasets.load_metric(\"glue\", TASK)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "Metric(name: \"glue\", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: \"\"\"\n",
       "Compute GLUE evaluation metric associated to each GLUE dataset.\n",
       "Args:\n",
       "    predictions: list of predictions to score.\n",
       "        Each translation should be tokenized into a list of tokens.\n",
       "    references: list of lists of references for each translation.\n",
       "        Each reference should be tokenized into a list of tokens.\n",
       "Returns: depending on the GLUE subset, one or several of:\n",
       "    \"accuracy\": Accuracy\n",
       "    \"f1\": F1 score\n",
       "    \"pearson\": Pearson Correlation\n",
       "    \"spearmanr\": Spearman Correlation\n",
       "    \"matthews_correlation\": Matthew Correlation\n",
       "Examples:\n",
       "\n",
       "    >>> glue_metric = datasets.load_metric('glue', 'sst2')  # 'sst2' or any of [\"mnli\", \"mnli_mismatched\", \"mnli_matched\", \"qnli\", \"rte\", \"wnli\", \"hans\"]\n",
       "    >>> references = [0, 1]\n",
       "    >>> predictions = [0, 1]\n",
       "    >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
       "    >>> print(results)\n",
       "    {'accuracy': 1.0}\n",
       "\n",
       "    >>> glue_metric = datasets.load_metric('glue', 'mrpc')  # 'mrpc' or 'qqp'\n",
       "    >>> references = [0, 1]\n",
       "    >>> predictions = [0, 1]\n",
       "    >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
       "    >>> print(results)\n",
       "    {'accuracy': 1.0, 'f1': 1.0}\n",
       "\n",
       "    >>> glue_metric = datasets.load_metric('glue', 'stsb')\n",
       "    >>> references = [0., 1., 2., 3., 4., 5.]\n",
       "    >>> predictions = [0., 1., 2., 3., 4., 5.]\n",
       "    >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
       "    >>> print({\"pearson\": round(results[\"pearson\"], 2), \"spearmanr\": round(results[\"spearmanr\"], 2)})\n",
       "    {'pearson': 1.0, 'spearmanr': 1.0}\n",
       "\n",
       "    >>> glue_metric = datasets.load_metric('glue', 'cola')\n",
       "    >>> references = [0, 1]\n",
       "    >>> predictions = [0, 1]\n",
       "    >>> results = glue_metric.compute(predictions=predictions, references=references)\n",
       "    >>> print(results)\n",
       "    {'matthews_correlation': 1.0}\n",
       "\"\"\", stored examples: 0)"
      ]
     },
     "metadata": {},
     "execution_count": 117
    }
   ],
   "source": [
    "metric"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "def compute_metrics(eval_pred):\n",
    "    predictions, labels = eval_pred\n",
    "    predictions = np.argmax(predictions, axis=1)\n",
    "    return metric.compute(predictions=predictions, references=labels)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training (aka Finetuning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import Trainer\n",
    "from transformers import TrainingArguments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = TrainingArguments(\n",
    "    output_dir='output',\n",
    "    do_eval=True,\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=args,\n",
    "    train_dataset=encoded_dataset[\"train\"],\n",
    "    eval_dataset=encoded_dataset[\"validation\"],\n",
    "    tokenizer=tokenizer,\n",
    "    compute_metrics=compute_metrics,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "source": [
    "## Hyperparameter Optimization\n",
    "\n",
    "`flaml.tune` is a module for economical hyperparameter tuning. It frees users from manually tuning many hyperparameters for a software, such as machine learning training procedures. \n",
    "The API is compatible with ray tune.\n",
    "\n",
    "### Step 1. Define training method\n",
    "\n",
    "We define a function `train_distilbert(config: dict)` that accepts a hyperparameter configuration dict `config`. The specific configs will be generated by flaml's search algorithm in a given search space.\n"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "import flaml\n",
    "\n",
    "def train_distilbert(config: dict):\n",
    "\n",
    "    # Define tokenize method\n",
    "    tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, use_fast=True)\n",
    "    def tokenize(examples):\n",
    "        return tokenizer(examples[COLUMN_NAME], truncation=True)\n",
    "\n",
    "    # Load CoLA dataset and apply tokenizer\n",
    "    cola_raw = datasets.load_dataset(\"glue\", TASK)\n",
    "    cola_encoded = cola_raw.map(tokenize, batched=True)\n",
    "    # QUESTION: Write processed data to disk?\n",
    "    train_dataset, eval_dataset = cola_encoded[\"train\"], cola_encoded[\"validation\"]\n",
    "\n",
    "    model = AutoModelForSequenceClassification.from_pretrained(\n",
    "        MODEL_CHECKPOINT, num_labels=NUM_LABELS\n",
    "    )\n",
    "\n",
    "    metric = datasets.load_metric(\"glue\", TASK)\n",
    "    def compute_metrics(eval_pred):\n",
    "        predictions, labels = eval_pred\n",
    "        predictions = np.argmax(predictions, axis=1)\n",
    "        return metric.compute(predictions=predictions, references=labels)\n",
    "\n",
    "    training_args = TrainingArguments(\n",
    "        output_dir='.',\n",
    "        do_eval=False,\n",
    "        disable_tqdm=True,\n",
    "        logging_steps=20000,\n",
    "        save_total_limit=0,\n",
    "        **config,\n",
    "    )\n",
    "\n",
    "    trainer = Trainer(\n",
    "        model,\n",
    "        training_args,\n",
    "        train_dataset=train_dataset,\n",
    "        eval_dataset=eval_dataset,\n",
    "        tokenizer=tokenizer,\n",
    "        compute_metrics=compute_metrics,\n",
    "    )\n",
    "\n",
    "    # train model\n",
    "    trainer.train()\n",
    "\n",
    "    # evaluate model\n",
    "    eval_output = trainer.evaluate()\n",
    "\n",
    "    # report the metric to optimize\n",
    "    flaml.tune.report(\n",
    "        loss=eval_output[\"eval_loss\"],\n",
    "        matthews_correlation=eval_output[\"eval_matthews_correlation\"],\n",
    "        )"
   ]
  },
  {
   "source": [
    "### Step 2. Define the search\n",
    "\n",
    "We are now ready to define our search. This includes:\n",
    "\n",
    "- The `search_space` for our hyperparameters\n",
    "- The metric and the mode ('max' or 'min') for optimization\n",
    "- The constraints (`n_cpus`, `n_gpus`, `num_samples`, and `time_budget_s`)"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_num_epoch = 64\n",
    "search_space = {\n",
    "        # You can mix constants with search space objects.\n",
    "        \"num_train_epochs\": flaml.tune.loguniform(1, max_num_epoch),\n",
    "        \"learning_rate\": flaml.tune.loguniform(1e-6, 1e-4),\n",
    "        \"adam_epsilon\": flaml.tune.loguniform(1e-9, 1e-7),\n",
    "        \"adam_beta1\": flaml.tune.uniform(0.8, 0.99),\n",
    "        \"adam_beta2\": flaml.tune.loguniform(98e-2, 9999e-4),\n",
    "    }"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# optimization objective\n",
    "HP_METRIC, MODE = \"matthews_correlation\", \"max\"\n",
    "\n",
    "# resources\n",
    "num_cpus = 4\n",
    "num_gpus = 4\n",
    "\n",
    "# constraints\n",
    "num_samples = -1    # number of trials, -1 means unlimited\n",
    "time_budget_s = 3600    # time budget in seconds"
   ]
  },
  {
   "source": [
    "### Step 3. Launch with `flaml.tune.run`\n",
    "\n",
    "We are now ready to launch the tuning using `flaml.tune.run`:"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2021-02-24 13:56:21,166\tINFO services.py:1173 -- View the Ray dashboard at \u001b[1m\u001b[32mhttp://127.0.0.1:8265\u001b[39m\u001b[22m\n",
      "2021-02-24 13:56:21,951\tWARNING optuna.py:126 -- You passed a `space` parameter to <class 'ray.tune.suggest.optuna.OptunaSearch'> that contained unresolved search space definitions. <class 'ray.tune.suggest.optuna.OptunaSearch'> should however be instantiated with fully configured search spaces only. To use Ray Tune's automatic search space conversion, pass the space definition as part of the `config` argument to `tune.run()` instead.\n",
      "\u001b[32m[I 2021-02-24 13:56:21,955]\u001b[0m A new study created in memory with name: optuna\u001b[0m\n",
      "Tuning started...\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 11.7/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 1/infinite (1 RUNNING)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m {'train_runtime': 37.2833, 'train_samples_per_second': 7.188, 'epoch': 1.0}\n",
      "Trial train_distilbert_21b2c490 reported matthews_correlation=0.00 with parameters={'num_train_epochs': 1, 'learning_rate': 5.61151641533451e-06, 'adam_epsilon': 7.969454818643929e-08, 'adam_beta1': 0.9390788489441669, 'adam_beta2': 0.99186521389353}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.0/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 21b2c490 with matthews_correlation=0.0 and parameters={'num_train_epochs': 1, 'learning_rate': 5.61151641533451e-06, 'adam_epsilon': 7.969454818643929e-08, 'adam_beta1': 0.9390788489441669, 'adam_beta2': 0.99186521389353}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 2/infinite (1 PENDING, 1 RUNNING)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_21b2c490 completed. Last result: loss=0.5786514282226562,matthews_correlation=0.0\n",
      "\u001b[2m\u001b[36m(pid=29589)\u001b[0m {'eval_loss': 0.5786514282226562, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.8133, 'eval_samples_per_second': 575.184, 'epoch': 1.0}\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m {'train_runtime': 205.6814, 'train_samples_per_second': 8.469, 'epoch': 6.5}\n",
      "Trial train_distilbert_21b2c491 reported matthews_correlation=0.51 with parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.5/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 21b2c491 with matthews_correlation=0.5093030018169853 and parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 3/infinite (1 PENDING, 1 RUNNING, 1 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_21b2c491 completed. Last result: loss=0.9910964965820312,matthews_correlation=0.5093030018169853\n",
      "\u001b[2m\u001b[36m(pid=29588)\u001b[0m {'eval_loss': 0.9910964965820312, 'eval_matthews_correlation': 0.5093030018169853, 'eval_runtime': 1.8366, 'eval_samples_per_second': 567.883, 'epoch': 6.5}\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m {'train_runtime': 37.2801, 'train_samples_per_second': 7.189, 'epoch': 1.0}\n",
      "Trial train_distilbert_3f0da820 reported matthews_correlation=0.00 with parameters={'num_train_epochs': 1.0, 'learning_rate': 5.265428651017862e-06, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9093950363089345, 'adam_beta2': 0.9937145453421068}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.7/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 21b2c491 with matthews_correlation=0.5093030018169853 and parameters={'num_train_epochs': 6.496661243646011, 'learning_rate': 3.1345403715761375e-05, 'adam_epsilon': 1.2428131101359459e-08, 'adam_beta1': 0.9100859688137786, 'adam_beta2': 0.9850788361346603}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 4/infinite (1 PENDING, 1 RUNNING, 2 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_3f0da820 completed. Last result: loss=0.5775065422058105,matthews_correlation=0.0\n",
      "\u001b[2m\u001b[36m(pid=29591)\u001b[0m {'eval_loss': 0.5775065422058105, 'eval_matthews_correlation': 0.0, 'eval_runtime': 1.7547, 'eval_samples_per_second': 594.388, 'epoch': 1.0}\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m {'train_runtime': 197.3016, 'train_samples_per_second': 8.591, 'epoch': 6.32}\n",
      "Trial train_distilbert_c1106c22 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.9/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 5/infinite (1 PENDING, 1 RUNNING, 3 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_c1106c22 completed. Last result: loss=0.8939734101295471,matthews_correlation=0.5451837431775948\n",
      "\u001b[2m\u001b[36m(pid=29590)\u001b[0m {'eval_loss': 0.8939734101295471, 'eval_matthews_correlation': 0.5451837431775948, 'eval_runtime': 1.8277, 'eval_samples_per_second': 570.669, 'epoch': 6.32}\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m {'train_runtime': 105.8952, 'train_samples_per_second': 7.847, 'epoch': 3.1}\n",
      "Trial train_distilbert_de95f5e6 reported matthews_correlation=0.48 with parameters={'num_train_epochs': 3.097601049860023, 'learning_rate': 3.015866216468612e-05, 'adam_epsilon': 6.092346813998939e-09, 'adam_beta1': 0.9628888910610184, 'adam_beta2': 0.9832186589335725}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.3/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 6/infinite (1 PENDING, 1 RUNNING, 4 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_de95f5e6 completed. Last result: loss=0.5720887780189514,matthews_correlation=0.48369222635456827\n",
      "\u001b[2m\u001b[36m(pid=8754)\u001b[0m {'eval_loss': 0.5720887780189514, 'eval_matthews_correlation': 0.48369222635456827, 'eval_runtime': 1.8561, 'eval_samples_per_second': 561.936, 'epoch': 3.1}\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m {'train_runtime': 330.1466, 'train_samples_per_second': 8.732, 'epoch': 10.76}\n",
      "Trial train_distilbert_5bb0a1fc reported matthews_correlation=0.53 with parameters={'num_train_epochs': 10.755455977982155, 'learning_rate': 5.858103269448852e-05, 'adam_epsilon': 5.045085830072572e-08, 'adam_beta1': 0.845137019185222, 'adam_beta2': 0.9882166289933315}.\n",
      "\u001b[2m\u001b[36m(pid=12777)\u001b[0m {'eval_loss': 1.5075323581695557, 'eval_matthews_correlation': 0.5282404248888111, 'eval_runtime': 1.7504, 'eval_samples_per_second': 595.853, 'epoch': 10.76}\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.9/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 7/infinite (1 PENDING, 1 RUNNING, 5 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_5bb0a1fc completed. Last result: loss=1.5075323581695557,matthews_correlation=0.5282404248888111\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m {'train_runtime': 182.3796, 'train_samples_per_second': 8.724, 'epoch': 5.94}\n",
      "Trial train_distilbert_a247fb2e reported matthews_correlation=0.54 with parameters={'num_train_epochs': 5.933063389003551, 'learning_rate': 1.845204084769373e-05, 'adam_epsilon': 1.372505378696326e-08, 'adam_beta1': 0.8534841230874768, 'adam_beta2': 0.9858475457825921}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.4/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 8/infinite (1 PENDING, 1 RUNNING, 6 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_a247fb2e completed. Last result: loss=0.6974263191223145,matthews_correlation=0.5399503104637741\n",
      "\u001b[2m\u001b[36m(pid=39770)\u001b[0m {'eval_loss': 0.6974263191223145, 'eval_matthews_correlation': 0.5399503104637741, 'eval_runtime': 1.8585, 'eval_samples_per_second': 561.204, 'epoch': 5.94}\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m {'train_runtime': 189.7562, 'train_samples_per_second': 8.59, 'epoch': 6.08}\n",
      "Trial train_distilbert_6e9e8ec2 reported matthews_correlation=0.52 with parameters={'num_train_epochs': 6.078693989748608, 'learning_rate': 1.8357895987910622e-05, 'adam_epsilon': 1.5849146381322022e-08, 'adam_beta1': 0.8904370071918882, 'adam_beta2': 0.9844583428325462}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 17.1/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 9/infinite (1 PENDING, 1 RUNNING, 7 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_6e9e8ec2 completed. Last result: loss=0.7202959656715393,matthews_correlation=0.5185394246694179\n",
      "\u001b[2m\u001b[36m(pid=7123)\u001b[0m {'eval_loss': 0.7202959656715393, 'eval_matthews_correlation': 0.5185394246694179, 'eval_runtime': 1.6051, 'eval_samples_per_second': 649.814, 'epoch': 6.08}\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m {'train_runtime': 329.789, 'train_samples_per_second': 8.448, 'epoch': 10.4}\n",
      "Trial train_distilbert_e30fd860 reported matthews_correlation=0.54 with parameters={'num_train_epochs': 10.39182109947885, 'learning_rate': 6.762356226483751e-05, 'adam_epsilon': 5.0195217227379364e-08, 'adam_beta1': 0.8951148565195837, 'adam_beta2': 0.9914274194005184}.\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m {'eval_loss': 1.505250334739685, 'eval_matthews_correlation': 0.5353569722427551, 'eval_runtime': 1.8314, 'eval_samples_per_second': 569.522, 'epoch': 10.4}\n",
      "\u001b[2m\u001b[36m(pid=14798)\u001b[0m \n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.9/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: c1106c22 with matthews_correlation=0.5451837431775948 and parameters={'num_train_epochs': 6.324445967486241, 'learning_rate': 2.9412189965562634e-05, 'adam_epsilon': 2.256452443236495e-08, 'adam_beta1': 0.880402156178546, 'adam_beta2': 0.9869155143904086}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 10/infinite (1 PENDING, 1 RUNNING, 8 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_e30fd860 completed. Last result: loss=1.505250334739685,matthews_correlation=0.5353569722427551\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m {'train_runtime': 259.759, 'train_samples_per_second': 9.078, 'epoch': 8.8}\n",
      "Trial train_distilbert_5bddb1ae reported matthews_correlation=0.55 with parameters={'num_train_epochs': 8.797715187430134, 'learning_rate': 2.72412577596775e-05, 'adam_epsilon': 7.4151444539151255e-09, 'adam_beta1': 0.869942964703411, 'adam_beta2': 0.9852670758817403}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.8/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 5bddb1ae with matthews_correlation=0.5492247863049868 and parameters={'num_train_epochs': 8.797715187430134, 'learning_rate': 2.72412577596775e-05, 'adam_epsilon': 7.4151444539151255e-09, 'adam_beta1': 0.869942964703411, 'adam_beta2': 0.9852670758817403}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 11/infinite (1 PENDING, 1 RUNNING, 9 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_5bddb1ae completed. Last result: loss=1.0900800228118896,matthews_correlation=0.5492247863049868\n",
      "\u001b[2m\u001b[36m(pid=27867)\u001b[0m {'eval_loss': 1.0900800228118896, 'eval_matthews_correlation': 0.5492247863049868, 'eval_runtime': 1.6198, 'eval_samples_per_second': 643.889, 'epoch': 8.8}\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m {'train_runtime': 251.169, 'train_samples_per_second': 8.544, 'epoch': 8.01}\n",
      "Trial train_distilbert_27da6108 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.1/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 12/infinite (1 PENDING, 1 RUNNING, 10 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_27da6108 completed. Last result: loss=0.8646725416183472,matthews_correlation=0.550740569901542\n",
      "\u001b[2m\u001b[36m(pid=38727)\u001b[0m {'eval_loss': 0.8646725416183472, 'eval_matthews_correlation': 0.550740569901542, 'eval_runtime': 1.7453, 'eval_samples_per_second': 597.588, 'epoch': 8.01}\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m {'train_runtime': 150.7963, 'train_samples_per_second': 8.641, 'epoch': 4.86}\n",
      "Trial train_distilbert_ca4167f2 reported matthews_correlation=0.55 with parameters={'num_train_epochs': 4.8609021804212205, 'learning_rate': 3.0765755916918634e-05, 'adam_epsilon': 3.2784085089990583e-09, 'adam_beta1': 0.9001311340399742, 'adam_beta2': 0.9865549219923857}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.7/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 13/infinite (1 PENDING, 1 RUNNING, 11 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_ca4167f2 completed. Last result: loss=0.7426601052284241,matthews_correlation=0.5474713423103301\n",
      "\u001b[2m\u001b[36m(pid=8698)\u001b[0m {'eval_loss': 0.7426601052284241, 'eval_matthews_correlation': 0.5474713423103301, 'eval_runtime': 1.6955, 'eval_samples_per_second': 615.172, 'epoch': 4.86}\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m {'train_runtime': 168.574, 'train_samples_per_second': 8.56, 'epoch': 5.38}\n",
      "Trial train_distilbert_6776ad66 reported matthews_correlation=0.50 with parameters={'num_train_epochs': 5.381515555130151, 'learning_rate': 1.4923436298344364e-05, 'adam_epsilon': 4.718609673277113e-08, 'adam_beta1': 0.8855356638050199, 'adam_beta2': 0.9817714112199931}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.7/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 14/infinite (1 PENDING, 1 RUNNING, 12 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "\u001b[2m\u001b[36m(pid=26401)\u001b[0m {'eval_loss': 0.6062898635864258, 'eval_matthews_correlation': 0.5039642659976749, 'eval_runtime': 1.8481, 'eval_samples_per_second': 564.358, 'epoch': 5.38}\n",
      "Trial train_distilbert_6776ad66 completed. Last result: loss=0.6062898635864258,matthews_correlation=0.5039642659976749\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m {'train_runtime': 267.304, 'train_samples_per_second': 8.694, 'epoch': 8.67}\n",
      "Trial train_distilbert_c904a63c reported matthews_correlation=0.54 with parameters={'num_train_epochs': 8.670157213614129, 'learning_rate': 3.589310669581693e-05, 'adam_epsilon': 1e-07, 'adam_beta1': 0.9159421419473668, 'adam_beta2': 0.9870278515925665}.\n",
      "\u001b[2m\u001b[36m(pid=36494)\u001b[0m {'eval_loss': 1.15528404712677, 'eval_matthews_correlation': 0.541934635424655, 'eval_runtime': 1.8046, 'eval_samples_per_second': 577.975, 'epoch': 8.67}\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.4/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 15/infinite (1 PENDING, 1 RUNNING, 13 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_c904a63c completed. Last result: loss=1.15528404712677,matthews_correlation=0.541934635424655\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m {'train_runtime': 401.1267, 'train_samples_per_second': 8.808, 'epoch': 13.18}\n",
      "Trial train_distilbert_34cd23b2 reported matthews_correlation=0.54 with parameters={'num_train_epochs': 13.180325143440442, 'learning_rate': 1.1392631517503339e-05, 'adam_epsilon': 8.551227707433237e-08, 'adam_beta1': 0.8917360114521684, 'adam_beta2': 0.9933954023113967}.\n",
      "\u001b[2m\u001b[36m(pid=7128)\u001b[0m {'eval_loss': 0.9118097424507141, 'eval_matthews_correlation': 0.5361146089547957, 'eval_runtime': 1.6269, 'eval_samples_per_second': 641.089, 'epoch': 13.18}\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.4/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 16/infinite (1 PENDING, 1 RUNNING, 14 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_34cd23b2 completed. Last result: loss=0.9118097424507141,matthews_correlation=0.5361146089547957\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m {'train_runtime': 261.9267, 'train_samples_per_second': 8.548, 'epoch': 8.35}\n",
      "Trial train_distilbert_dbc01c60 reported matthews_correlation=0.53 with parameters={'num_train_epochs': 8.351740081197375, 'learning_rate': 4.14474164779562e-05, 'adam_epsilon': 2.5536744573294183e-08, 'adam_beta1': 0.9010345773126118, 'adam_beta2': 0.98213801095907}.\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 16.0/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 4/4 CPUs, 4/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 17/infinite (1 PENDING, 1 RUNNING, 15 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Trial train_distilbert_dbc01c60 completed. Last result: loss=1.270609974861145,matthews_correlation=0.5331291095663535\n",
      "\u001b[2m\u001b[36m(pid=23493)\u001b[0m {'eval_loss': 1.270609974861145, 'eval_matthews_correlation': 0.5331291095663535, 'eval_runtime': 1.7863, 'eval_samples_per_second': 583.876, 'epoch': 8.35}\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Reusing dataset glue (/home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4)\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-bec756fc24993464.arrow\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-3b411a778de4d998.arrow\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Loading cached processed dataset at /home/chiw/.cache/huggingface/datasets/glue/cola/1.0.0/7c99657241149a24692c402a5c3f34d4c9f1df5ac2e4c3759fadea38f6cb29c4/cache-c7231adac87a0159.arrow\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m - This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m - This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m {'train_runtime': 307.947, 'train_samples_per_second': 8.501, 'epoch': 9.77}\n",
      "2021-02-24 15:01:18,861\tINFO stopper.py:193 -- Reached timeout of 3600 seconds. Stopping all trials.\n",
      "Trial train_distilbert_d1e00f7e reported matthews_correlation=0.50 with parameters={'num_train_epochs': 9.768470529742105, 'learning_rate': 7.278242504625585e-06, 'adam_epsilon': 9.024121328462365e-08, 'adam_beta1': 0.9568651413276459, 'adam_beta2': 0.9898624818542463}.\n",
      "\u001b[2m\u001b[36m(pid=33982)\u001b[0m {'eval_loss': 0.6356746554374695, 'eval_matthews_correlation': 0.502884728860933, 'eval_runtime': 1.7441, 'eval_samples_per_second': 598.03, 'epoch': 9.77}\n"
     ]
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.9/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 18/infinite (18 TERMINATED)<br><br>"
     },
     "metadata": {}
    },
    {
     "output_type": "display_data",
     "data": {
      "text/plain": "<IPython.core.display.HTML object>",
      "text/html": "== Status ==<br>Memory usage on this node: 15.9/251.8 GiB<br>Using FIFO scheduling algorithm.<br>Resources requested: 0/4 CPUs, 0/4 GPUs, 0.0/161.91 GiB heap, 0.0/50.63 GiB objects (0/1.0 accelerator_type:V100)<br>Current best trial: 27da6108 with matthews_correlation=0.550740569901542 and parameters={'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}<br>Result logdir: /raid/chiw/FLAML/notebook/logs/train_distilbert_2021-02-24_13-56-21<br>Number of trials: 18/infinite (18 TERMINATED)<br><table>\n<thead>\n<tr><th>Trial name               </th><th>status    </th><th>loc  </th><th style=\"text-align: right;\">  adam_beta1</th><th style=\"text-align: right;\">  adam_beta2</th><th style=\"text-align: right;\">  adam_epsilon</th><th style=\"text-align: right;\">  learning_rate</th><th style=\"text-align: right;\">  num_train_epochs</th><th style=\"text-align: right;\">  iter</th><th style=\"text-align: right;\">  total time (s)</th><th style=\"text-align: right;\">    loss</th><th style=\"text-align: right;\">  matthews_correlation</th></tr>\n</thead>\n<tbody>\n<tr><td>train_distilbert_21b2c490</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.939079</td><td style=\"text-align: right;\">    0.991865</td><td style=\"text-align: right;\">   7.96945e-08</td><td style=\"text-align: right;\">    5.61152e-06</td><td style=\"text-align: right;\">           1      </td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">         46.9698</td><td style=\"text-align: right;\">0.578651</td><td style=\"text-align: right;\">              0       </td></tr>\n<tr><td>train_distilbert_21b2c491</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.910086</td><td style=\"text-align: right;\">    0.985079</td><td style=\"text-align: right;\">   1.24281e-08</td><td style=\"text-align: right;\">    3.13454e-05</td><td style=\"text-align: right;\">           6.49666</td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">        215.872 </td><td style=\"text-align: right;\">0.991096</td><td style=\"text-align: right;\">              0.509303</td></tr>\n<tr><td>train_distilbert_3f0da820</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.909395</td><td style=\"text-align: right;\">    0.993715</td><td style=\"text-align: right;\">   1e-07      </td><td style=\"text-align: right;\">    5.26543e-06</td><td style=\"text-align: right;\">           1      </td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">         47.3068</td><td style=\"text-align: right;\">0.577507</td><td style=\"text-align: right;\">              0       </td></tr>\n<tr><td>train_distilbert_c1106c22</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.880402</td><td style=\"text-align: right;\">    0.986916</td><td style=\"text-align: right;\">   2.25645e-08</td><td style=\"text-align: right;\">    2.94122e-05</td><td style=\"text-align: right;\">           6.32445</td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">        207.618 </td><td style=\"text-align: right;\">0.893973</td><td style=\"text-align: right;\">              0.545184</td></tr>\n<tr><td>train_distilbert_de95f5e6</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.962889</td><td style=\"text-align: right;\">    0.983219</td><td style=\"text-align: right;\">   6.09235e-09</td><td style=\"text-align: right;\">    3.01587e-05</td><td style=\"text-align: right;\">           3.0976 </td><td style=\"text-align: right;\">     1</td><td style=\"text-align: right;\">        115.872 </td><td style=\"text-align: right;\">0.572089</td><td style=\"text-align: right;\">              0.483692</td></tr>\n<tr><td>train_distilbert_5bb0a1fc</td><td>TERMINATED</td><td>     </td><td style=\"text-align: right;\">    0.845137</td><td style=\"text-align: right;\"
     },
     "metadata": {}
    },
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "2021-02-24 15:01:18,957\tINFO tune.py:448 -- Total run time: 3897.00 seconds (3896.97 seconds for the tuning loop).\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "import ray\n",
    "start_time = time.time()\n",
    "ray.shutdown()\n",
    "ray.init(num_cpus=num_cpus, num_gpus=num_gpus)\n",
    "\n",
    "print(\"Tuning started...\")\n",
    "analysis = flaml.tune.run(\n",
    "    train_distilbert,\n",
    "    config=search_space,\n",
    "    init_config={\n",
    "        \"num_train_epochs\": 1,\n",
    "    },\n",
    "    metric=HP_METRIC,\n",
    "    mode=MODE,\n",
    "    report_intermediate_result=False,\n",
    "    # uncomment the following if report_intermediate_result = True\n",
    "    # max_resource=max_num_epoch, min_resource=1,\n",
    "    resources_per_trial={\"gpu\": num_gpus, \"cpu\": num_cpus},\n",
    "    local_dir='logs/',\n",
    "    num_samples=num_samples,\n",
    "    time_budget_s=time_budget_s,\n",
    "    use_ray=True,\n",
    ")\n",
    "\n",
    "ray.shutdown()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "n_trials=18\ntime=3903.5583679676056\nBest model eval matthews_correlation: 0.5507\nBest model parameters: {'num_train_epochs': 8.005678804316002, 'learning_rate': 1.931832460928058e-05, 'adam_epsilon': 6.696984191794608e-08, 'adam_beta1': 0.9116736888940158, 'adam_beta2': 0.9869397626562693}\n"
     ]
    }
   ],
   "source": [
    "best_trial = analysis.get_best_trial(HP_METRIC, MODE, \"all\")\n",
    "metric = best_trial.metric_analysis[HP_METRIC][MODE]\n",
    "print(f\"n_trials={len(analysis.trials)}\")\n",
    "print(f\"time={time.time()-start_time}\")\n",
    "print(f\"Best model eval {HP_METRIC}: {metric:.4f}\")\n",
    "print(f\"Best model parameters: {best_trial.config}\")\n"
   ]
  },
  {
   "source": [
    "## Next Steps\n",
    "\n",
    "Notice that we only reported the metric with `flaml.tune.report` at the end of full training loop. It is possible to enable reporting of intermediate performance - allowing early stopping - as follows:\n",
    "\n",
    "- Huggingface provides _Callbacks_ which can be used to insert the `flaml.tune.report` call inside the training loop\n",
    "- Make sure to set `do_eval=True` in the `TrainingArguments` provided to `Trainer` and adjust the evaluation frequency accordingly"
   ],
   "cell_type": "markdown",
   "metadata": {}
  }
 ],
 "metadata": {
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3",
   "language": "python"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.9-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}