Instructions to use remiai3/text-to-code-using-codegen-project_guide with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use remiai3/text-to-code-using-codegen-project_guide with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("remiai3/text-to-code-using-codegen-project_guide", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
import importlib.util
import os

# --- Hugging Face Hub configuration -----------------------------------------
# huggingface_hub evaluates HF_HUB_DOWNLOAD_TIMEOUT and
# HF_HUB_ENABLE_HF_TRANSFER when it is imported (transformers and datasets
# import it), so they have to be in the environment *before* those imports;
# setting them afterwards has no effect. setdefault() keeps any value the user
# already exported in their shell.
os.environ.setdefault("HF_HUB_DOWNLOAD_TIMEOUT", "600")  # 10 minutes timeout

# The fast downloader needs the optional `hf_transfer` package. Enabling the
# flag without it makes downloads fail, so only opt in when it is importable.
if importlib.util.find_spec("hf_transfer") is not None:
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # Enable robust downloader

# Authentication: do NOT hard-code a token here. A placeholder value would
# overwrite a real token from the environment and be sent as credentials,
# which the Hub can reject even for public models. If a token is needed,
# export HF_TOKEN in the shell (or run `huggingface-cli login`) before
# starting this script; it is picked up from the environment automatically.

import torch  # noqa: E402  (imports deliberately follow the env setup above)
from transformers import (  # noqa: E402
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
from datasets import load_dataset  # noqa: E402
import matplotlib.pyplot as plt  # noqa: E402
# Hub checkpoint to fine-tune and the local directory used as download cache
model_name = "Salesforce/codegen-350M-multi"
local_model_path = "./codegen_model"

# The whole script runs on CPU
device = torch.device("cpu")

# Tokenizer: fetch (or reuse from cache), then reuse the end-of-sequence
# token as the padding token
print(f"Attempting to download/load tokenizer from {model_name} to {local_model_path}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=local_model_path)
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# Model: loaded with the default dtype on purpose. torch_dtype=torch.float16
# was removed because it is typically meant for GPU and might not help (or
# might even misbehave) on some CPU setups.
print(f"Attempting to download/load model from {model_name} to {local_model_path}...")
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=local_model_path)
print("Model loaded.")

model.to(device)
print(f"Model moved to {device}.")
# Load custom dataset from JSONL file
dataset_file = "custom_dataset.jsonl"
print(f"Loading dataset from {dataset_file}...")
dataset = load_dataset("json", data_files=dataset_file, split="train")
print("Dataset loaded.")
print(f"Dataset size: {len(dataset)} examples.")

# Fail fast with a readable message instead of an IndexError on dataset[0]
# or a KeyError later, inside the tokenization step.
if len(dataset) == 0:
    raise ValueError(f"{dataset_file} contains no examples; nothing to fine-tune on.")
missing_columns = {"prompt", "code"} - set(dataset.column_names)
if missing_columns:
    raise ValueError(
        f"{dataset_file} is missing required column(s) {sorted(missing_columns)}; "
        f"found {dataset.column_names}."
    )

# Print first example to check data format
print(f"First example of dataset: {dataset[0]}")
# Tokenize dataset
def tokenize_function(examples, max_length=64):
    """Tokenize a batch of prompt/code pairs as single training texts.

    Each prompt is joined to its code with a newline, then the batch is
    truncated and padded to exactly ``max_length`` tokens.

    Args:
        examples: Batch with parallel ``"prompt"`` and ``"code"`` sequences.
        max_length: Fixed token length of every encoded example. The default
            of 64 is deliberately small to save memory; lower it (e.g. 32)
            if memory is still tight, raise it if examples get cut off.

    Returns:
        The output of the module-level ``tokenizer`` for the joined texts.
    """
    inputs = [f"{prompt}\n{code}" for prompt, code in zip(examples["prompt"], examples["code"])]
    return tokenizer(inputs, truncation=True, padding="max_length", max_length=max_length)
print("Tokenizing dataset...")
# Tokenize in batches; the raw text columns are dropped afterwards
raw_text_columns = ["prompt", "code"]
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_text_columns,
)
print("Dataset tokenized.")
first_tokenized_example = tokenized_dataset[0]
print(f"First tokenized example: {first_tokenized_example}")
# Collator for language modeling with masked-LM switched off
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training configuration, collected in one mapping and unpacked below
training_config = {
    # Output / checkpointing
    "output_dir": "./finetuned_codegen",
    "overwrite_output_dir": True,
    "save_steps": 500,
    "save_total_limit": 2,
    # Schedule and optimisation
    "num_train_epochs": 3,
    "learning_rate": 5e-5,
    "max_grad_norm": 1.0,
    # Aggressively small batches for CPU: no accumulation, true batch size 1
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 1,
    # Log often so it is visible that training is actually progressing
    "logging_steps": 10,
    "report_to": "none",  # no external reporting, avoids potential hangs
    # CPU-specific switches
    "fp16": False,  # keep off on CPU
    "use_cpu": True,  # used instead of no_cuda=True
    "dataloader_pin_memory": False,  # pin_memory disabled for CPU
    "gradient_checkpointing": True,  # kept on, helps with memory on CPU too
}
training_args = TrainingArguments(**training_config)
# Trainer callback that records the training loss for later plotting
class LossCallback(TrainerCallback):
    """Collect every logged ``loss`` value together with its global step."""

    def __init__(self):
        self.losses = []     # loss values, in logging order
        self.log_steps = []  # global step at which each loss was logged

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Store and echo the loss whenever a log entry contains one."""
        # Entries without a "loss" key (or with no logs at all) are ignored
        if not logs or "loss" not in logs:
            return
        step = state.global_step
        loss = logs["loss"]
        self.losses.append(loss)
        self.log_steps.append(step)
        print(f"Step {step}: Loss = {loss:.4f}")


loss_callback = LossCallback()
# Wire model, arguments, data, collator and loss recorder into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    callbacks=[loss_callback],
)

# Run fine-tuning, after telling the user what to expect
for notice in (
    "Starting fine-tuning...",
    "WARNING: Training on CPU will be extremely slow. The 0% progress bar might take a very long time to update.",
    "Please monitor your system's RAM and CPU usage.",
):
    print(notice)
trainer.train()
print("Fine-tuning finished.")

# Persist the fine-tuned model and its tokenizer side by side
finetuned_dir = "./finetuned_codegen"
for artifact in (model, tokenizer):
    artifact.save_pretrained(finetuned_dir)
print("Model fine-tuned and saved to ./finetuned_codegen.")
# Draw the recorded loss curve and save it next to the fine-tuned model
if not loss_callback.losses:
    print("No training losses recorded to plot.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(loss_callback.log_steps, loss_callback.losses, label="Training Loss")
    ax.set_xlabel("Steps")
    ax.set_ylabel("Loss")
    ax.set_title("Fine-Tuning Loss Curve")
    ax.legend()
    ax.grid(True)
    plot_path = "./finetuned_codegen/loss_plot.png"
    fig.savefig(plot_path)
    print(f"Loss plot saved to {plot_path}")

# NOTE(review): the original indentation was lost; plt.show() is assumed to
# run after both branches rather than only in the "no losses" case - confirm.
plt.show()
print("Fine-tuning script finished execution.")