from sentence_transformers import SentenceTransformer, losses
from torch.utils.data import DataLoader
def prepare_training_data() -> list[tuple[str, str]]:
    """Return the domain-specific query-document training pairs.

    Each item is a 2-tuple of strings: the first element is the query and
    the second element is the document text paired with it.

    Returns:
        A newly built list of ``(query, document)`` tuples.
    """
    return [
        ("What is EBITDA?", "EBITDA (Earnings Before Interest, Taxes..."),
        ("Explain capital expenditure", "Capital expenditure (CapEx) refers to..."),
        # ... thousands more pairs
    ]
def fine_tune_model(
    *,
    base_model: str = "all-MiniLM-L6-v2",
    output_path: str = "./fine_tuned_financial_model",
    batch_size: int = 16,
    epochs: int = 3,
    warmup_steps: int = 100,
) -> "SentenceTransformer":
    """Fine-tune a sentence-embedding model on the domain query-document pairs.

    The pairs returned by ``prepare_training_data()`` are wrapped in
    ``InputExample`` objects, trained with ``MultipleNegativesRankingLoss``,
    and the resulting model is saved to ``output_path``.

    Args:
        base_model: Name or path of the model to start from.
        output_path: Directory the fine-tuned model is saved to.
        batch_size: Number of pairs per training batch.
        epochs: Number of passes over the training data.
        warmup_steps: Number of warmup steps passed to ``model.fit``.

    Returns:
        The fine-tuned ``SentenceTransformer`` instance.

    Raises:
        ValueError: If ``prepare_training_data()`` returns no pairs.
    """
    # Imported locally so this function brings its own dependency into scope
    # without requiring a change to the module-level import line.
    from sentence_transformers import InputExample

    # Prepare training data first so an empty dataset fails fast, before the
    # base model is loaded.
    # model.fit() collates batches by reading ``.texts`` from each item, so
    # plain (query, document) tuples must be wrapped in InputExample.
    train_examples = [
        pair if isinstance(pair, InputExample) else InputExample(texts=list(pair))
        for pair in prepare_training_data()
    ]
    if not train_examples:
        raise ValueError("prepare_training_data() returned no training pairs")

    # Load base model
    model = SentenceTransformer(base_model)

    train_dataloader = DataLoader(
        train_examples, shuffle=True, batch_size=batch_size
    )

    # Define loss function: with (query, document) pairs, the other documents
    # in the same batch serve as the negatives for each query.
    train_loss = losses.MultipleNegativesRankingLoss(model)

    # Train
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        epochs=epochs,
        warmup_steps=warmup_steps,
    )
    model.save(output_path)
    return model
# Use fine-tuned model: load it from './fine_tuned_financial_model', the same
# directory that fine_tune_model() saves to.
# NOTE(review): nothing visible here calls fine_tune_model() before this line —
# presumably the directory must already exist on disk; confirm before relying
# on this load at import time.
embedding_model = SentenceTransformer('./fine_tuned_financial_model')
# (Web-page footer captured along with the snippet; not part of the code.)
# No comments:
# Post a Comment