aaryan3781 committed on
Commit
458a791
·
1 Parent(s): 2d2fbde

Update abstract.py

Browse files
Files changed (1) hide show
  1. abstract.py +30 -10
abstract.py CHANGED
@@ -4,21 +4,41 @@ import numpy as np
4
  from rank_bm25 import BM25Okapi
5
  from nltk.tokenize import word_tokenize
6
  import nltk
 
 
7
  nltk.download('punkt')
8
 
9
- # Load data from the PKL file
10
- with open('data_hydrogen.pkl', 'rb') as pkl_file:
11
- data = pickle.load(pkl_file)
 
 
 
 
 
 
 
12
 
13
- # save texts as .npy file
14
- np.save('texts.npy', np.array(data['Abstract'].values))
15
 
 
 
16
 
17
- loaded_texts = np.load('texts.npy', allow_pickle=True)
 
 
18
 
19
- tokenized_texts = [word_tokenize(doc.lower()) for doc in loaded_texts]
20
- bm25 = BM25Okapi(tokenized_texts)
 
 
21
 
22
- with open('bm25.pkl', 'wb') as pkl_file:
23
- pickle.dump(bm25, pkl_file)
24
 
 
 
 
 
 
 
 
4
  from rank_bm25 import BM25Okapi
5
  from nltk.tokenize import word_tokenize
6
  import nltk
7
+ import sys
8
+
9
  # Fetch NLTK's 'punkt' resource; presumably this is the tokenizer data that
  # word_tokenize needs. Being top-level, the call runs on every import or run.
  nltk.download('punkt')
10
 
11
def load_data(file_path):
    """Load and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The object that was pickled into the file.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # SECURITY: unpickling can execute arbitrary code embedded in the file.
    # The path is supplied on the command line, so only use trusted files.
    with open(file_path, 'rb') as pkl_file:
        return pickle.load(pkl_file)
15
+
16
def save_texts(data, output_path='texts.npy'):
    """Write the ``'Abstract'`` entries of *data* to a NumPy ``.npy`` file.

    Args:
        data: Object indexable by ``'Abstract'`` whose result exposes a
            ``.values`` attribute (presumably a pandas DataFrame; confirm
            against the contents of the input pickle).
        output_path: Destination file. Note that ``np.save`` appends
            ``.npy`` when the given name does not already end with it.

    Raises:
        KeyError: If *data* has no ``'Abstract'`` entry.
    """
    try:
        abstracts = data['Abstract']
    except KeyError as err:
        # Re-raise with a message that says what the input was expected to hold.
        raise KeyError("input data has no 'Abstract' column") from err
    # asarray avoids a second copy when ``.values`` is already an ndarray.
    np.save(output_path, np.asarray(abstracts.values))
18
+
19
def load_texts(file_path='texts.npy'):
    """Load the array of texts stored in a ``.npy`` file.

    Args:
        file_path: Path of the ``.npy`` file to read.

    Returns:
        The NumPy array stored in the file.
    """
    # SECURITY: allow_pickle=True lets NumPy unpickle object-dtype arrays,
    # which can execute arbitrary code. Only load files this tool wrote
    # itself or that come from a trusted source.
    return np.load(file_path, allow_pickle=True)
21
 
22
def tokenize_texts(texts):
    """Lower-case and word-tokenize every document in *texts*.

    Args:
        texts: Iterable of documents.

    Returns:
        A list with one token list per input document, in input order.
        Entries that are not ``str`` (for example ``None`` or a float NaN)
        produce an empty token list instead of raising, so positions in
        the result keep matching positions in the input.
    """
    return [
        word_tokenize(doc.lower()) if isinstance(doc, str) else []
        for doc in texts
    ]
24
 
25
def build_bm25_model(tokenized_texts):
    """Build a ``BM25Okapi`` index over a tokenized corpus.

    Args:
        tokenized_texts: Iterable of token lists, one per document.

    Returns:
        The ``BM25Okapi`` instance built from the corpus.

    Raises:
        ValueError: If the corpus contains no documents.
    """
    # Materialise first so generators and arrays can be checked for emptiness.
    corpus = list(tokenized_texts)
    if not corpus:
        # BM25Okapi averages document length over the corpus size, so an
        # empty corpus would otherwise fail with an opaque ZeroDivisionError.
        raise ValueError("cannot build a BM25 index from an empty corpus")
    return BM25Okapi(corpus)
27
 
28
def save_bm25_model(bm25_model, output_path='bm25.pkl'):
    """Serialize *bm25_model* with pickle into the file at *output_path*.

    The file is opened in binary write mode, so any existing content at
    that path is replaced.
    """
    with open(output_path, mode='wb') as sink:
        pickle.dump(bm25_model, sink)
31
 
32
def main(argv=None):
    """Build and persist a BM25 index from a pickled dataset.

    Loads the dataset, saves its abstracts to ``texts.npy``, reads them
    back, tokenizes them, builds the BM25 model and saves it to
    ``bm25.pkl`` (the default paths of the helper functions).

    Args:
        argv: Full argument vector including the program name. Defaults
            to ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 1 when the argument count is
        wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 2:
        program = argv[0] if argv else "abstract.py"
        # Usage errors belong on stderr and should name the real script.
        print(f"Usage: python {program} <file_path>", file=sys.stderr)
        return 1

    data = load_data(argv[1])
    save_texts(data)
    # Read the corpus back from disk so the index is built from exactly
    # what was persisted.
    loaded_texts = load_texts()
    tokenized_texts = tokenize_texts(loaded_texts)
    bm25 = build_bm25_model(tokenized_texts)
    save_bm25_model(bm25)
    return 0


if __name__ == "__main__":
    sys.exit(main())