load sequence
- app.py +135 -73
- yt_stats.py +2 -2
app.py
CHANGED
@@ -156,6 +156,96 @@ def get_extracted_text(raw_text):
 def get_extracted_text_to_dict(raw_text):
     st.session_state['extract'] = [raw_text,0,0,0,0]
 
+def get_videos_from_yt(yt):
+
+    vids_thumbnails = []
+    vids_videoIds = []
+    vids_titles = []
+    vids_lengths = []
+    vids_published= []
+    vids_views= []
+    item=0
+    for video in yt.video_data:
+        if item == item_limit:
+            break
+        item = item+1
+
+        vids_video_id = video
+        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
+
+        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
+        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
+        vids_thumbnails.append(yt_img_html_link)
+
+        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
+        vids_videoIds.append(vids_video_id_link)
+
+        vids_titles.append(yt.video_data[video]['title'])
+
+        yt_length = yt.video_data[video]['duration']
+        yt_length_isodate = isodate.parse_duration(yt_length)
+        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
+        vids_lengths.append(yt_length_isoformat)
+
+        yt_publish_date = yt.video_data[video]['publishedAt']
+        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
+        vids_published.append(yt_publish_date_formatted)
+
+        yt_views = yt.video_data[video]['viewCount']
+        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
+        vids_views.append(yt_viws_formatted)
+
+    df_videos = {'Video': vids_thumbnails,
+                 'Video ID':vids_videoIds,
+                 'Title':vids_titles,
+                 'Published':vids_published,
+                 'Views':vids_views,
+                 'Length':vids_lengths}
+
+    return df_videos
+
+def get_transcript(video_id):
+
+    transcript_list = yta.list_transcripts(video_id)
+
+    transcript_raw = None
+    transcript_item = transcript_list.find_transcript(['en'])
+    transcript_item_is_generated = transcript_item.is_generated
+    transcript_raw = transcript_item.fetch()
+
+    if transcript_raw is None:
+        return None
+
+    transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
+
+    return transcript_text, transcript_item_is_generated
+
+def get_meta_info(video_id, url):
+
+    yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
+    yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
+    yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
+    video_info = {'ID': [video_id],
+                  'Video':[yt_img_html_link],
+                  'Author': [st.session_state["video_data"]["Author"][0]],
+                  'Channel':[st.session_state["channel_id"]],
+                  'Title': [st.session_state["video_data"]["Title"][0]],
+                  'Published': [st.session_state["video_data"]["Published"][0]],
+                  'Views':[st.session_state["video_data"]["Views"][0]],
+                  'Length':[st.session_state["video_data"]["Length"][0]],
+                  'Keywords':['; '.join(st.session_state["keywords"])]}
+
+    transcript_info = {'Words':[int(st.session_state.extract[1])],
+                       'Sentences': [int(st.session_state.extract[2])],
+                       'Characters': [int(st.session_state.extract[3])],
+                       'Tokens':[int(st.session_state.extract[4])],
+                       'Lextext':[st.session_state.extract[0]],
+                       'GPTSummary':[0]}
+    df_current_ts = pd.DataFrame({**video_info,**transcript_info})
+
+    return df_current_ts
+
 
 #######################################################################################
 # Application Start
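Note on the new get_transcript helper: both call sites unpack its result into two names, but the no-transcript branch returns a bare None, which would raise a TypeError during unpacking before any later "if transcript_text is None:" guard can run. In current youtube_transcript_api, fetch() also reports failure by raising (e.g. NoTranscriptFound, TranscriptsDisabled) rather than by returning None, so the "transcript_raw is None" check is unlikely to ever fire. A possible hardening, sketched as a hypothetical variant rather than what this commit ships:

    from youtube_transcript_api import YouTubeTranscriptApi as yta  # assumed alias used by app.py

    # Hypothetical variant (not part of this commit): catch the library's
    # exceptions and always return a two-item tuple so callers can unpack it.
    def get_transcript_safe(video_id):
        try:
            transcript_list = yta.list_transcripts(video_id)
            transcript_item = transcript_list.find_transcript(['en'])
            transcript_raw = transcript_item.fetch()
        except Exception:
            return None, False
        transcript_text = '\n'.join(i['text'].replace('\n', ' ') for i in transcript_raw)
        return transcript_text, transcript_item.is_generated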
@@ -247,19 +337,12 @@ st.write("")
 # Load Transcript
 ###########################
 
-transcript_list = yta.list_transcripts(video_id)
+transcript_text, transcript_item_is_generated = get_transcript(video_id)
 
-transcript_raw = None
-transcript_item = transcript_list.find_transcript(['en'])
-transcript_item_is_generated = transcript_item.is_generated
-transcript_raw = transcript_item.fetch()
-
-if transcript_raw is None:
+if transcript_text is None:
     st.error("No transcript available.")
     st.stop()
 
-transcript_text = '\n'.join([i['text'].replace('\n',' ') for i in transcript_raw])
-
 ########################
 # Load Author Keywords, that are not viewable by users
 ########################
@@ -356,26 +439,7 @@ if st.button('Extract Sentences'):
     st.error('Please run extraction first.', icon="🚨")
 else:
 
-    yt_img = f'http://img.youtube.com/vi/{video_id}/mqdefault.jpg'
-    yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-    yt_img_html_link = '<a href='+url+'>'+yt_img_html+'</a>'
-    video_info = {'ID': [video_id],
-                  'Video':[yt_img_html_link],
-                  'Author': [st.session_state["video_data"]["Author"][0]],
-                  'Channel':[st.session_state["channel_id"]],
-                  'Title': [st.session_state["video_data"]["Title"][0]],
-                  'Published': [st.session_state["video_data"]["Published"][0]],
-                  'Views':[st.session_state["video_data"]["Views"][0]],
-                  'Length':[st.session_state["video_data"]["Length"][0]],
-                  'Keywords':['; '.join(st.session_state["keywords"])]}
-
-    transcript_info = {'Words':[int(st.session_state.extract[1])],
-                       'Sentences': [int(st.session_state.extract[2])],
-                       'Characters': [int(st.session_state.extract[3])],
-                       'Tokens':[int(st.session_state.extract[4])],
-                       'Lextext':[st.session_state.extract[0]],
-                       'GPTSummary':[0]}
-    df_current_ts = pd.DataFrame({**video_info,**transcript_info})
+    df_current_ts = get_meta_info(video_id, url)
 
     # initial write.
     #df_new_sheet = pd.concat([df_current_ts])
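get_meta_info takes video_id and url as parameters but still reads video_data, channel_id, keywords, and extract out of st.session_state, so it only produces a complete row after the earlier load and extract steps have populated those keys. A fail-fast guard along these lines (hypothetical, not part of the commit) would surface the problem more clearly than a KeyError:

    # Hypothetical pre-check before calling get_meta_info(video_id, url)
    required = ("video_data", "channel_id", "keywords", "extract")
    missing = [key for key in required if key not in st.session_state]
    if missing:
        st.error("Run the load and extract steps first (missing: " + ", ".join(missing) + ")")
        st.stop()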
@@ -473,64 +537,62 @@ st.write("")
 
 if st.button('Load Videos'):
 
+    if 'gsheed' not in st.session_state:
+        df = mysheet.read_gspread()
+        st.session_state.gsheed = df
+
     progress_text = 'Loading...'
     loading_bar = st.progress(0, text=progress_text)
     item_limit=3
-    yt.get_channel_video_data(st.session_state["channel_id"], loading_bar, progress_text, item_limit)
+    df = st.session_state.gsheed
+    yt.get_channel_video_data(st.session_state["channel_id"],df, loading_bar, progress_text, item_limit)
 
-
-
-
+    df_videos = get_videos_from_yt(yt)
+    dataset = pd.DataFrame(df_videos)
+    st.markdown(dataset.style.hide(axis="index").to_html(), unsafe_allow_html=True)
 
 
-    vids_thumbnails = []
-    vids_videoIds = []
-    vids_titles = []
-    vids_lengths = []
-    vids_published= []
-    vids_views= []
-    item=0
-    for video in yt.video_data:
-        if item == item_limit:
-            break
-        item = item+1
+########################
+# Sequence Loader
+########################
 
-        vids_video_id = video
-        vids_url = 'https://www.youtube.com/watch?v='+vids_video_id
 
-        yt_img = f'http://img.youtube.com/vi/{vids_video_id}/mqdefault.jpg'
-        yt_img_html = '<img src='+yt_img+' width="250" height="150" />'
-        yt_img_html_link = '<a href='+vids_url+'>'+yt_img_html+'</a>'
-        vids_thumbnails.append(yt_img_html_link)
-
-        vids_video_id_link = '<a target="_self" href="/?vid='+vids_video_id+'">'+vids_video_id+'</a>'
-        vids_videoIds.append(vids_video_id_link)
+st.subheader("Sequence Loader")
+# input hash as secret
 
-        vids_titles.append(yt.video_data[video]['title'])
+input_hash = st.text_input("Enter Hash:")
+if st.button('Load Sequence'):
+    HASH_KEY = st.secrets["hash_key"]
+    if input_hash == HASH_KEY:
+        st.write("Access granted")
+        # read in spreadsheet
+        if 'gsheed' not in st.session_state:
+            df = mysheet.read_gspread()
+            st.session_state.gsheed = df
 
-        yt_length = yt.video_data[video]['duration']
-        yt_length_isodate = isodate.parse_duration(yt_length)
-        yt_length_isoformat = isodate.duration_isoformat(yt_length_isodate, "%H:%M:%S")[1:]
-        vids_lengths.append(yt_length_isoformat)
-
-        yt_publish_date = yt.video_data[video]['publishedAt']
-        yt_publish_date_formatted = datetime.strptime(yt_publish_date, '%Y-%m-%dT%H:%M:%SZ').strftime('%B %d, %Y')
-        vids_published.append(yt_publish_date_formatted)
+        progress_text = 'Loading...'
+        loading_bar = st.progress(0, text=progress_text)
+        item_limit=3
+        df = st.session_state.gsheed
+        yt.get_channel_video_data(st.session_state["channel_id"], df,loading_bar, progress_text, item_limit)
+        df_videos = get_videos_from_yt(yt)
+        dataset = pd.DataFrame(df_videos)
 
-        yt_views = yt.video_data[video]['viewCount']
-        yt_viws_formatted = format(int(yt_views), ",").replace(",", "'")
-        vids_views.append(yt_viws_formatted)
+        for sng in dataset['Video ID']:
+            subsng = sng[sng.find('>')+1:sng.find('</')]
+            print(subsng)
+
+            transcript_text, transcript_item_is_generated = get_transcript(subsng)
 
-    df_videos = {'Video': vids_thumbnails,
-                 'Video ID':vids_videoIds,
-                 'Title':vids_titles,
-                 'Published':vids_published,
-                 'Views':vids_views,
-                 'Length':vids_lengths}
+            if transcript_item_is_generated:
+                get_punctuated_text(transcript_text)
+            else:
+                get_punctuated_text_to_dict(transcript_text)
 
+            get_extracted_text(st.session_state.punkt[0])
 
-
-
+    else:
+        st.write("Access denied")
 
 
 
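Two details in the new Load Videos / Sequence Loader paths are worth flagging. First, get_videos_from_yt(yt) reads item_limit as a module-level name that each button branch happens to assign before the call; passing it as a parameter would make the dependency explicit. Second, the sequence loader recovers plain video IDs by slicing the HTML anchors stored in the Video ID column (subsng = sng[sng.find('>')+1:sng.find('</')]). A small helper, hypothetical and not part of the commit, does that extraction more defensively:

    import re

    # Hypothetical helper: pull the plain ID back out of the stored anchor markup.
    def video_id_from_link(html_link):
        match = re.search(r'>([^<]+)</a>', html_link)
        return match.group(1) if match else html_link

    # video_id_from_link('<a target="_self" href="/?vid=abc123">abc123</a>') -> 'abc123'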
yt_stats.py
CHANGED
@@ -33,7 +33,7 @@ class YTstats:
         #pbar.close()
         return data
 
-    def get_channel_video_data(self, channel_id, loading_bar, progress_text, item_limit=3):
+    def get_channel_video_data(self, channel_id, df_sheet, loading_bar, progress_text, item_limit=3):
         "Extract all video information of the channel"
         print('get video data...')
         channel_videos, channel_playlists = self._get_channel_content(channel_id, limit=50)

@@ -61,7 +61,7 @@ class YTstats:
         duration = isodate.parse_duration(channel_videos[video_id]['duration'])
         short_duration = isodate.parse_duration('PT4M')
 
-        if duration > short_duration:
+        if duration > short_duration and video_id not in list(df_sheet.ID):
             item = item+1
             step = step +step_size
             channel_videos_out[video_id] = channel_videos[video_id]
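The new filter condition rebuilds list(df_sheet.ID) on every loop iteration and scans it linearly; it also assumes the sheet exposes an ID column, which matches the 'ID': [video_id] field that get_meta_info writes in app.py. Hoisting the known IDs into a set once per call is a cheap alternative, sketched here under that same column assumption:

    import pandas as pd

    df_sheet = pd.DataFrame({'ID': ['abc123', 'def456']})  # assumed sheet shape

    known_ids = set(df_sheet.ID)   # built once; O(1) membership tests
    print('abc123' in known_ids)   # True
    print('zzz999' in known_ids)   # False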