In [ ]:
# !pip install packaging
# !pip install ninja
# !pip install flash-attn --no-build-isolation
# https://github.com/bdashore3/flash-attention/releases
# !pip install peft transformers datasets
# https://github.com/peremartra/Large-Language-Model-Notebooks-Course/blob/main/5-Fine%20Tuning/LoRA_Tuning_PEFT.ipynb
In [ ]:
# Python 3.11.7
# GPU: RTX 4070 Ti 12GB
# CUDA: 12.1 (cuda_12.1.r12.1)
# Libraries:
# torch 2.2.2+cu121
# flash_attn 2.5.9.post1
Load Model¶
In [ ]:
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training
)
import transformers
import torch
import pickle
import time
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
)
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
In [ ]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    # llm_int8_skip_modules=None,
    # llm_int8_enable_fp32_cpu_offload=False,
    # llm_int8_has_fp16_weight=False,
    # load_in_4bit=True,
    # bnb_4bit_quant_type='nf4',
    # bnb_4bit_compute_dtype=compute_dtype,
    # bnb_4bit_use_double_quant=False,
)
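If 8-bit loading does not fit comfortably on the 12 GB card, the commented-out options above can be swapped for a 4-bit NF4 configuration. A minimal sketch of that variant (not used in this run), assembled from the same commented parameters:
In [ ]:
# Hypothetical 4-bit alternative: NF4 quantization with fp16 compute,
# roughly halving the weight memory compared to 8-bit.
bnb_config_4bit = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,  # torch.float16 from above
    bnb_4bit_use_double_quant=False,
)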
In [ ]:
model_name_or_path = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    use_cache=False
)  # load the model
In [ ]:
# Prepare the quantized model for training: freeze base weights, cast layer norms
# to fp32, and make the input embeddings require gradients.
model = prepare_model_for_kbit_training(model)
# Trade extra compute for lower activation memory during backprop.
model.gradient_checkpointing_enable()
In [ ]:
model
In [ ]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    # target_modules='all-linear'
    target_modules=["qkv_proj"],  # optional, you can target specific layers using this
    # target_modules=["v_proj", "q_proj"]
)  # create LoRA config for the fine-tuning
peft_model = get_peft_model(model, peft_config)  # create a model ready for LoRA fine-tuning
In [ ]:
peft_model
In [ ]:
peft_model.print_trainable_parameters()
Fine-Tuning¶
Load and Prepare data¶
In [ ]:
with open('./dataset/full_formal_script_temp0.8.pkl', 'rb') as fp:
    data_formal = pickle.load(fp)
with open('./dataset/full_informal_script_temp0.8.pkl', 'rb') as fp:
    data_informal = pickle.load(fp)
with open('./dataset/full_novel_script_temp0.8.pkl', 'rb') as fp:
    data_novel = pickle.load(fp)
df = pd.read_csv('./dataset/raw.csv', index_col='uid')
# https://wandb.ai/capecape/alpaca_ft/reports/How-to-Fine-Tune-an-LLM-Part-1-Preparing-a-Dataset-for-Instruction-Tuning--Vmlldzo1NTcxNzE2
In [ ]:
df['json'] = df.apply(lambda x : {'name': x['name'], 'age': x['age'], 'job': x['job']}, axis=1)
df
In [ ]:
for temp in data_formal:
    for uid in data_formal[temp]:
        df.loc[uid, 'formal'] = data_formal[temp][uid]
for temp in data_informal:
    for uid in data_informal[temp]:
        df.loc[uid, 'informal'] = data_informal[temp][uid]
for temp in data_novel:
    for uid in data_novel[temp]:
        df.loc[uid, 'novel'] = data_novel[temp][uid]
In [ ]:
train_ratio = 0.9
index = int(len(df)*train_ratio)
train_df, test_df = df[:index], df[index:]
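The split above is purely positional: the first 90% of rows become the training set. If the CSV rows are not already shuffled, a seeded shuffle before slicing avoids ordering bias while still splitting at the uid level, so the formal, informal, and novel phrasings of a person stay in the same split. A small sketch of that variant (hypothetical, not used below):
In [ ]:
# Optional variant: shuffle the rows (one row per uid) with a fixed seed before slicing.
shuffled_df = df.sample(frac=1.0, random_state=42)
index = int(len(shuffled_df) * train_ratio)
train_df, test_df = shuffled_df[:index], shuffled_df[index:]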
In [ ]:
train_prep_df = train_df.reset_index()[['uid', 'json', 'formal', 'informal', 'novel']].melt(
    id_vars=['uid', 'json'],
    var_name="type",
    value_name="context"
).sort_values('uid')
train_prep_df = train_prep_df[['json', 'context']]
test_prep_df = test_df.reset_index()[['uid', 'json', 'formal', 'informal', 'novel']].melt(
    id_vars=['uid', 'json'],
    var_name="type",
    value_name="context"
).sort_values('uid')
test_prep_df = test_prep_df[['json', 'context']]
In [ ]:
# https://stackoverflow.com/questions/67852880/how-can-i-handle-this-datasets-to-create-a-datasetdict
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_prep_df, preserve_index=False),
    'test': Dataset.from_pandas(test_prep_df, preserve_index=False)
})
In [ ]:
dataset
Test the Model with Zero-Shot Inference¶
In [ ]:
token_fn = AutoTokenizer.from_pretrained(model_name_or_path)
In [ ]:
def create_prompt_formats(sample, add_result=False):
    ################# Version = 1 ################
    # prompt = f"""You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability.
    # Extract name, age, job from the sentence to JSON format. If the information doesn't exist, fill null.
    # Sentence: '{sample['context']}'
    # """
    # if add_result:
    #     prompt += f"{sample['json']}"
    ################# Version = 2 ################
    prompt = f"""<|system|>
You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability.<|end|>
<|user|>
Extract name, age, job from the sentence to JSON format. If the information doesn't exist, fill null.
Sentence: '{sample['context']}'<|end|>
<|assistant|>
"""
    # Append the target JSON only when building training examples.
    if add_result:
        prompt += f"{sample['json']}<|end|>"
    sample["text"] = prompt
    # sample["json"] = str(sample['json'])
    return sample
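As an aside, the same Phi-3 chat markup can usually be produced with the tokenizer's built-in chat template instead of hand-writing the <|system|>/<|user|>/<|assistant|> tokens. A rough sketch, assuming the tokenizer loaded above as token_fn and a hypothetical example sentence; exact whitespace and system-role handling depend on the template shipped with the checkpoint:
In [ ]:
# Sketch: build the prompt via the tokenizer's chat template (not used below).
messages = [
    {"role": "system", "content": "You are a helpful, smart, kind, and efficient AI assistant."},
    {"role": "user", "content": "Extract name, age, job from the sentence to JSON format. If the information doesn't exist, fill null.\nSentence: 'John is a 34 year old teacher.'"},
]
templated = token_fn.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(templated)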
In [ ]:
train_dataset = dataset['train'].map(create_prompt_formats, fn_kwargs={'add_result': True}, remove_columns=['json', 'context'])
eval_dataset = dataset['test'].map(create_prompt_formats, fn_kwargs={'add_result': False}, remove_columns=['json', 'context'])
In [ ]:
train_dataset
In [ ]:
def tokenize_function(examples):
    return token_fn(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)
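Because padding="max_length" also truncates anything longer than 512 tokens, it is worth confirming that the longest formatted training prompt actually fits within that limit. A quick sanity-check sketch, assuming the datasets defined above:
In [ ]:
# Sketch: token lengths of the untruncated training prompts.
lengths = [len(token_fn(t)["input_ids"]) for t in train_dataset["text"]]
print(f"max: {max(lengths)}, mean: {sum(lengths) / len(lengths):.1f}")  # should stay under 512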
In [ ]:
tokenized_train_dataset
Training model¶
In [ ]:
import inspect
# Check which keyword arguments the base model's forward() accepts;
# Trainer uses this signature when remove_unused_columns=True drops dataset columns.
model_to_inspect = peft_model.get_base_model()
signature = inspect.signature(model_to_inspect.forward)
list(signature.parameters.keys())
In [ ]:
output_dir = f'./Phi-3-mini-4k-instruct-8Blora-text2json-training-clean-{str(int(time.time()))}'
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch size = 4 * 4 = 16
    save_steps=10,
    save_total_limit=50,
    logging_steps=1,
    learning_rate=1e-3,
    weight_decay=0.01,
    remove_unused_columns=True,
    fp16=True
)
In [ ]:
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(token_fn, mlm=False)
)
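With mlm=False, the collator clones input_ids into labels for next-token prediction and masks positions equal to the tokenizer's pad token with -100 so they are excluded from the loss. A small sanity-check sketch using the variables above:
In [ ]:
# Sketch: inspect the labels the language-modeling collator produces.
collator = transformers.DataCollatorForLanguageModeling(token_fn, mlm=False)
sample_features = [
    {k: tokenized_train_dataset[i][k] for k in ("input_ids", "attention_mask")}
    for i in range(2)
]
batch = collator(sample_features)
print(batch["input_ids"].shape, batch["labels"].shape)  # identical shapes
print((batch["labels"] == -100).sum())  # padded positions are ignored by the loss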
In [ ]:
try:
    trainer.train()
except RuntimeError as e:
    print(f"Error during training: {e}")
    print("Attempting to continue training on CPU...")
    device = torch.device("cpu")
    model = model.to(device)
    training_args.fp16 = False
    training_args.per_device_train_batch_size = 1
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        data_collator=transformers.DataCollatorForLanguageModeling(token_fn, mlm=False),
    )
    trainer.train()
Save model¶
In [ ]:
peft_model.save_pretrained('./Tuning/checkpoint/Phi-3-mini-4k-instruct')
In [ ]:
token_fn.save_pretrained('./Tuning/checkpoint/Phi-3-mini-4k-instruct/tokenize')
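To reuse the adapter later without rerunning this notebook, it can be re-attached to a freshly loaded base model. A minimal sketch, assuming the checkpoint paths written above:
In [ ]:
# Sketch: reload the saved LoRA adapter and tokenizer for inference.
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
reloaded_tokenizer = AutoTokenizer.from_pretrained('./Tuning/checkpoint/Phi-3-mini-4k-instruct/tokenize')
reloaded_model = PeftModel.from_pretrained(base_model, './Tuning/checkpoint/Phi-3-mini-4k-instruct')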
Test the fine-tuned model¶
In [ ]:
def create_prompt_formats_eval(sample):
    prompt = f"""<|system|>
You are a helpful, smart, kind, and efficient AI assistant. You always fulfill the user's requests to the best of your ability.<|end|>
<|user|>
Extract name, age, job from the sentence to JSON format. If the information doesn't exist, fill null.
Sentence: '{sample['context']}'<|end|>
<|assistant|>
"""
    sample["text"] = prompt
    return sample
In [ ]:
import re

# Build evaluation prompts (prompt only, no target JSON) from the raw test split.
eval_prompts = dataset['test'].map(create_prompt_formats_eval)

index = 1
inputs = token_fn(
    eval_prompts[index]['text'],
    truncation=True,
    padding="max_length",
    max_length=512,
    return_tensors="pt"
).to(peft_model.device)
# Generate the prediction
outputs = peft_model.generate(**inputs, max_new_tokens=512)
# Decode the output
predicted_text = token_fn.decode(outputs[0], skip_special_tokens=True)
result = re.findall('{.*}', predicted_text)[0]
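The regex above returns the matched substring as plain text. Because the fine-tuning target was a Python dict repr (single quotes), ast.literal_eval is more forgiving than json.loads for turning it into a dict; this sketch assumes the model produced a well-formed literal:
In [ ]:
# Sketch: parse the extracted string into a dict and re-serialize it as proper JSON.
import ast
import json

parsed = ast.literal_eval(result)  # e.g. {'name': ..., 'age': ..., 'job': ...}
print(json.dumps(parsed, indent=2))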