"""Build and persist a BM25 index over the 'Abstract' column of a pickled dataset.

Pipeline: unpickle DataFrame -> dump abstracts to ``texts.npy`` -> reload ->
lowercase + word-tokenize -> fit ``BM25Okapi`` -> pickle model to ``bm25.pkl``.
"""
import pickle
import sys

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi


def load_data(file_path):
    """Unpickle and return the dataset.

    Expected to be a pandas DataFrame with an 'Abstract' column (see save_texts).

    NOTE(security): ``pickle.load`` can execute arbitrary code — only use
    this on trusted files, never on untrusted input.
    """
    with open(file_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)


def save_texts(data, output_path='texts.npy'):
    """Persist the 'Abstract' column of *data* as a NumPy array on disk."""
    np.save(output_path, np.array(data['Abstract'].values))


def load_texts(file_path='texts.npy'):
    """Load the saved abstracts.

    ``allow_pickle=True`` is required because string columns are stored as
    object arrays.
    """
    return np.load(file_path, allow_pickle=True)


def tokenize_texts(texts):
    """Lowercase and word-tokenize each document; returns a list of token lists."""
    return [word_tokenize(doc.lower()) for doc in texts]


def build_bm25_model(tokenized_texts):
    """Fit and return a BM25Okapi ranking model over the tokenized corpus."""
    return BM25Okapi(tokenized_texts)


def save_bm25_model(bm25_model, output_path='bm25.pkl'):
    """Pickle the fitted BM25 model to *output_path*."""
    with open(output_path, 'wb') as pkl_file:
        pickle.dump(bm25_model, pkl_file)


def main(file_path="Your_dataset.pkl"):
    """Run the full index-building pipeline end to end."""
    # Tokenizer data required by word_tokenize.
    # NOTE(review): recent NLTK releases also need the 'punkt_tab' resource —
    # confirm against the installed NLTK version.
    nltk.download('punkt')
    data = load_data(file_path)
    save_texts(data)
    loaded_texts = load_texts()
    tokenized_texts = tokenize_texts(loaded_texts)
    bm25 = build_bm25_model(tokenized_texts)
    save_bm25_model(bm25)


if __name__ == "__main__":
    main()