# Spaces: Sleeping
import json
import math

import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
# Checkpoint to evaluate. Other checkpoints this script has been pointed at:
#   /nfs/maziyang.mzy/models/vicuna-7b-v1.5
#   /nfs/maziyang.mzy/models/Llama-2-7b-chat-hf
MODEL_PATH = "/nfs/zhifu.gzf/ckpt/Llama-2-7b-hf"
# JSON-lines corpus; the "target" field of each record is the text to score.
CORPUS_PATH = "/nfs/maziyang.mzy/data/librispeech/librispeech_test_clean_filtered.jsonl"
DEVICE = "cuda:7"


def load_targets(path: str) -> list:
    """Return the ``"target"`` value of every record in a JSON-lines file.

    Blank lines are skipped. A record without a ``"target"`` key contributes
    ``None`` so the caller can count and report unusable records.
    """
    targets = []
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            targets.append(json.loads(line).get("target"))
    return targets


def sentence_nll(model, tokenizer, sentence: str, device: str):
    """Score one sentence and return ``(summed_nll, num_predicted_tokens)``.

    The sentence is stripped and lower-cased before tokenization. Returns
    ``(0.0, 0)`` when the encoded sequence is too short to predict anything.
    """
    inputs = tokenizer(sentence.strip().lower(), return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]
    # Causal-LM heads shift the labels by one position, so the loss is a mean
    # over seq_len - 1 predicted tokens. Scaling by that same count (not by a
    # word count) recovers the summed negative log-likelihood.
    num_predicted = input_ids.size(1) - 1
    if num_predicted < 1:
        return 0.0, 0
    with torch.no_grad():
        outputs = model(**inputs, labels=input_ids)
    return outputs.loss.item() * num_predicted, num_predicted


def perplexity(total_nll: float, total_tokens: int) -> float:
    """Return ``exp(total_nll / total_tokens)``.

    Raises:
        ValueError: if ``total_tokens`` is not positive.
    """
    if total_tokens <= 0:
        raise ValueError("cannot compute perplexity: no tokens were scored")
    return math.exp(total_nll / total_tokens)


def main() -> None:
    """Print the token-level perplexity of ``MODEL_PATH`` on ``CORPUS_PATH``."""
    # Read the corpus first so a bad path fails before the model is loaded.
    corpus = load_targets(CORPUS_PATH)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)
    model.to(DEVICE)
    model.eval()

    total_nll = 0.0
    total_tokens = 0
    skipped = 0
    for sentence in tqdm(corpus):
        if not isinstance(sentence, str) or not sentence.strip():
            skipped += 1
            continue
        nll, num_tokens = sentence_nll(model, tokenizer, sentence, DEVICE)
        if num_tokens == 0:
            skipped += 1
            continue
        total_nll += nll
        total_tokens += num_tokens

    if skipped:
        print(f"Skipped {skipped} record(s) with no scorable text")

    corpus_ppl = perplexity(total_nll, total_tokens)
    print(f"Model: {MODEL_PATH}")
    print(f"Corpus: {CORPUS_PATH}")
    print(f"Corpus Perplexity: {corpus_ppl}")


if __name__ == "__main__":
    main()