# BTP_chatbot / abstract.py
# aaryan3781's picture
# Update abstract.py
# 2e896f5
import pickle
import pandas as pd
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
import sys
# Tokenizer data required by word_tokenize. NLTK releases before 3.8.2 read
# the 'punkt' resource; later releases read 'punkt_tab' instead, so both are
# fetched to keep the script working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')
def load_data(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and is closed again once the object
    has been read.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    with open(file_path, 'rb') as source:
        return pickle.Unpickler(source).load()
def save_texts(data, output_path='texts.npy'):
    """Write the ``'Abstract'`` column of ``data`` to ``output_path``.

    ``data['Abstract'].values`` is copied into a NumPy array and stored with
    ``np.save``. Nothing is returned.
    """
    abstracts = np.array(data['Abstract'].values)
    np.save(output_path, abstracts)
def load_texts(file_path='texts.npy'):
    """Load and return the array stored in the ``.npy`` file ``file_path``.

    ``allow_pickle=True`` is passed so arrays holding Python objects can be
    read back; because that involves unpickling, only load trusted files.
    """
    texts = np.load(file_path, allow_pickle=True)
    return texts
def tokenize_texts(texts):
    """Lower-case and word-tokenize every document in ``texts``.

    Args:
        texts: Iterable of documents. String entries are lower-cased and
            split with ``word_tokenize``.

    Returns:
        A list with one token list per input entry, in input order.

    Entries that are not strings (for example ``None`` or a float NaN
    standing in for a missing value) have no ``lower`` method and used to
    raise ``AttributeError``. They now yield an empty token list, so position
    ``i`` of the result always corresponds to position ``i`` of ``texts``.
    """
    return [
        word_tokenize(doc.lower()) if isinstance(doc, str) else []
        for doc in texts
    ]
def build_bm25_model(tokenized_texts):
    """Create a ``BM25Okapi`` model from an already tokenized corpus.

    ``tokenized_texts`` is handed to ``BM25Okapi`` unchanged and the
    resulting model object is returned.
    """
    model = BM25Okapi(tokenized_texts)
    return model
def save_bm25_model(bm25_model, output_path='bm25.pkl'):
    """Pickle ``bm25_model`` into the file at ``output_path``.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten. Nothing is returned.
    """
    with open(output_path, 'wb') as sink:
        pickle.Pickler(sink).dump(bm25_model)
# Build pipeline, run as plain module-level statements: load the pickled
# dataset, persist its 'Abstract' column, read it back, tokenize it, fit a
# BM25 model and pickle that model.
# NOTE(review): "Your_dataset.pkl" looks like a placeholder path — confirm
# the real dataset location before running.
file_path="Your_dataset.pkl"
data = load_data(file_path)
# No output path is passed, so save_texts writes to its default 'texts.npy'.
save_texts(data)
# Reads the default 'texts.npy', i.e. the file written on the previous line.
loaded_texts = load_texts()
tokenized_texts = tokenize_texts(loaded_texts)
bm25 = build_bm25_model(tokenized_texts)
# No output path is passed, so the model is written to the default 'bm25.pkl'.
save_bm25_model(bm25)