LLM Science Exam

An ML pipeline for fine-tuning a Large Language Model on a multiple-choice science exam

Posted by Xinyao Wu on July 22, 2023

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForMultipleChoice, TrainingArguments, Trainer
# The following data collator (adapted from https://huggingface.co/docs/transformers/tasks/multiple_choice)
# dynamically pads each batch at collation time, so we don't have to pad every question
# to the length of the longest question in the dataset.

from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        # Flatten each (question, choice) pair into its own example so the tokenizer
        # can pad all batch_size * num_choices sequences to a common length.
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Un-flatten back to (batch_size, num_choices, seq_len) and re-attach the labels.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch
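
As a quick sanity check, here is a minimal sketch (not from the original notebook) that feeds the collator a toy two-choice batch; the bert-base-cased checkpoint name and the encode_toy helper are illustrative assumptions.

# Sketch only: exercise DataCollatorForMultipleChoice on a toy batch.
# 'bert-base-cased' is an illustrative checkpoint; the Kaggle run below uses a local copy instead.
tok = AutoTokenizer.from_pretrained('bert-base-cased')

def encode_toy(prompt, options, label):
    # Tokenize the prompt paired with every option, then attach the gold label index.
    enc = dict(tok([prompt] * len(options), options, truncation=True))
    enc['label'] = label
    return enc

toy_batch = [
    encode_toy('What is H2O commonly called?', ['Water', 'Salt'], 0),
    encode_toy('Which planet is known as the Red Planet?', ['Venus', 'Mars'], 1),
]
padded = DataCollatorForMultipleChoice(tokenizer=tok)(toy_batch)
print(padded['input_ids'].shape)  # torch.Size([2, 2, <padded length>])
print(padded['labels'])           # tensor([0, 1])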
# --- Skeleton requirement: the class should be able to generate predictions with different Hugging Face models.

class LLM_prediction:

    def __init__(self, model_path, options='ABCDE'):
        self.model_path = model_path
        self.options = options
        self.indices = list(range(len(options)))
        self.option_to_index = {option: index for option, index in zip(self.options, self.indices)}
        self.index_to_option = {index: option for option, index in zip(self.options, self.indices)}
        # Load the tokenizer once here rather than re-loading it for every row in pre_process_data.
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_path)

    def read_data(self, data_folder=None):
        # training
        train_df = pd.read_csv(f"{data_folder}/train.csv")
        self.train_ds = Dataset.from_pandas(train_df)
        # testing
        self.test_df = pd.read_csv(f"{data_folder}/test.csv")
        return self.train_ds, self.test_df

    def pre_process_data(self, row):
        # Pair the prompt with every option so the model scores each (question, option) pair.
        question = [row['prompt']] * len(self.options)
        answers = [row[option] for option in self.options]
        tokenized_row = self.tokenizer(question, answers, truncation=True)
        tokenized_row['label'] = self.option_to_index[row['answer']]
        return tokenized_row

    def nlp(self, output_model_dir='finetuned_bert'):
        # Fine-tune a multiple-choice head on top of the pre-trained model and return the Trainer.
        model = AutoModelForMultipleChoice.from_pretrained(self.model_path)
        tokenized_train_ds = self.train_ds.map(self.pre_process_data, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
        training_args = TrainingArguments(
            output_dir=output_model_dir,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            load_best_model_at_end=True,
            learning_rate=5e-5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            num_train_epochs=3,
            weight_decay=0.01,
            report_to='none'
        )
        self.trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized_train_ds,
            eval_dataset=tokenized_train_ds,  # no separate validation split, so we evaluate on the training set
            tokenizer=self.tokenizer,
            data_collator=DataCollatorForMultipleChoice(tokenizer=self.tokenizer),
        )
        self.trainer.train()
        return self.trainer

    def predictions_to_map_output(self, predictions):
        # Rank the options by predicted score and keep the top three per question,
        # formatted as space-separated letters for the MAP@3 submission format.
        sorted_answer_indices = np.argsort(-predictions)
        top_answer_indices = sorted_answer_indices[:, :3]  # first three answers in each row
        top_answers = np.vectorize(self.index_to_option.get)(top_answer_indices)
        return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)

    def inference(self, assign_random_answer=True):
        if not assign_random_answer:
            raise ValueError('Only placeholder-answer inference has been implemented so far.')
        # The test set has no answer column, so add a placeholder label that pre_process_data can consume.
        self.test_df['answer'] = 'A'
        self.test_ds = Dataset.from_pandas(self.test_df)
        tokenized_test_ds = self.test_ds.map(self.pre_process_data, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
        predictions = self.trainer.predict(tokenized_test_ds)
        return self.predictions_to_map_output(predictions.predictions)
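
The top-3, space-separated output of predictions_to_map_output exists because the competition scores submissions with MAP@3. For reference, here is a hedged sketch of that metric; the map_at_3 helper and the toy inputs are illustrative assumptions, not part of the pipeline.

# Sketch only: with a single correct answer per question, MAP@3 reduces to 1/rank of the true option.
def map_at_3(predicted, actual):
    total = 0.0
    for preds, truth in zip(predicted, actual):
        ranked = preds.split()[:3]                    # e.g. 'B A C' -> ['B', 'A', 'C']
        if truth in ranked:
            total += 1.0 / (ranked.index(truth) + 1)  # contributes 1, 1/2 or 1/3
    return total / len(actual)

print(map_at_3(['A B C', 'C A B'], ['A', 'B']))  # (1 + 1/3) / 2 ≈ 0.667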



llm_test = LLM_prediction(model_path='/kaggle/input/huggingface-bert/bert-base-cased')
train_ds, test_df = llm_test.read_data(data_folder='/kaggle/input/kaggle-llm-science-exam')
llm_test.nlp()
Some weights of the model checkpoint at /kaggle/input/huggingface-bert/bert-base-cased were not used when initializing BertForMultipleChoice: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at /kaggle/input/huggingface-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

[75/75 00:54, Epoch 3/3]

Epoch    Training Loss    Validation Loss
1        No log           1.480417
2        No log           1.163188
3        No log           1.066047

<transformers.trainer.Trainer at 0x7ebc65122440>
res = llm_test.inference()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
submission_df = test_df[['id']].copy()  # .copy() avoids pandas' SettingWithCopyWarning
submission_df['prediction'] = res
submission_df.to_csv('submission.csv', index=False)
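
Because the class is parameterised only by model_path, the same skeleton should in principle work with other Hugging Face multiple-choice checkpoints. Below is a hedged sketch of swapping in a different model; the microsoft/deberta-v3-base checkpoint is illustrative and assumes it can be downloaded from the Hub or attached as a Kaggle dataset (code competitions usually run without internet, so in practice it would be attached as an input dataset like the BERT checkpoint above).

# Sketch only: reuse the skeleton with a different checkpoint (the path is illustrative).
other_llm = LLM_prediction(model_path='microsoft/deberta-v3-base')
other_llm.read_data(data_folder='/kaggle/input/kaggle-llm-science-exam')
other_llm.nlp(output_model_dir='finetuned_deberta')
other_res = other_llm.inference()

other_submission = other_llm.test_df[['id']].copy()
other_submission['prediction'] = other_res
other_submission.to_csv('submission.csv', index=False)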