# Spaces: Sleeping
import pickle
import sys

import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from rank_bm25 import BM25Okapi
# Fetch the tokenizer data used by word_tokenize. NLTK >= 3.8.2 looks up the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so request both
# to keep the script working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
def load_data(file_path):
    """Load and return the object pickled at ``file_path``.

    Args:
        file_path: Path of the pickle file; it is opened in binary mode.

    Returns:
        The object reconstructed by ``pickle.load``.
    """
    # SECURITY: unpickling can execute arbitrary code, so only point this at
    # files from a trusted source.
    with open(file_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)
def save_texts(data, output_path='texts.npy'):
    """Write the ``'Abstract'`` column of ``data`` to a NumPy ``.npy`` file.

    Args:
        data: Object supporting ``data['Abstract'].values`` (for example a
            pandas DataFrame with an ``Abstract`` column).
        output_path: Destination file. ``np.save`` appends ``.npy`` when the
            given name does not already end with it.
    """
    abstracts = np.array(data['Abstract'].values)
    np.save(output_path, abstracts)
def load_texts(file_path='texts.npy'):
    """Load and return the array stored in a ``.npy`` file.

    Args:
        file_path: Path of the ``.npy`` file to read.

    Returns:
        The NumPy array read from ``file_path``.
    """
    # allow_pickle=True lets NumPy restore object-dtype arrays (Python
    # strings), but it unpickles data: only load files from a trusted source.
    return np.load(file_path, allow_pickle=True)
def tokenize_texts(texts):
    """Lower-case and word-tokenize every document in ``texts``.

    Args:
        texts: Iterable of documents.

    Returns:
        A list with one token list per input document, in input order.
        Entries that are not strings (for example ``None`` or a float NaN
        standing in for a missing value) produce an empty token list instead
        of raising, so output positions stay aligned with the input.
    """
    return [
        word_tokenize(doc.lower()) if isinstance(doc, str) else []
        for doc in texts
    ]
def build_bm25_model(tokenized_texts):
    """Construct a ``BM25Okapi`` model from a tokenized corpus.

    Args:
        tokenized_texts: The corpus passed straight to ``BM25Okapi``; the
            caller in this file supplies a list of token lists.

    Returns:
        The ``BM25Okapi`` instance built from ``tokenized_texts``.
    """
    return BM25Okapi(tokenized_texts)
def save_bm25_model(bm25_model, output_path='bm25.pkl'):
    """Pickle ``bm25_model`` to ``output_path``.

    Args:
        bm25_model: The object to serialize with ``pickle.dump``.
        output_path: Destination file; it is opened in binary write mode, so
            an existing file at that path is overwritten.
    """
    with open(output_path, 'wb') as pkl_file:
        pickle.dump(bm25_model, pkl_file)
# Pipeline: load the pickled dataset, persist its abstracts, then build and
# persist a BM25 model over the tokenized abstracts.
file_path = "Your_dataset.pkl"  # placeholder name; replace with the real dataset
data = load_data(file_path)

# The abstracts are written to texts.npy and read back before tokenizing --
# presumably so the model is built from exactly what was persisted.
save_texts(data)
loaded_texts = load_texts()

tokenized_texts = tokenize_texts(loaded_texts)
bm25 = build_bm25_model(tokenized_texts)
save_bm25_model(bm25)  # written to the default bm25.pkl