ai-deadlines / .github /scripts /update_conferences_new.py
nielsr's picture
nielsr HF Staff
Refactor conference data
f3cc2d7
raw
history blame
12.1 kB
import yaml
import requests
import os
import re
from datetime import datetime
from typing import Dict, List, Any
def fetch_conference_files() -> List[Dict[str, Any]]:
    """Fetch all conference YAML files from the ccfddl repository.

    Lists the ``conference/AI`` directory through the GitHub contents API,
    downloads every entry whose name ends in ``.yml`` and parses it with
    ``yaml.safe_load``.

    Returns:
        The first element of every file whose parsed content is a non-empty
        list. Files with any other shape are ignored.

    Raises:
        requests.RequestException: If the directory listing cannot be fetched
            or returns an HTTP error status.
        ValueError: If the listing is not a JSON array (for example an API
            error object).
    """
    api_url = "https://api.github.com/repos/ccfddl/ccf-deadlines/contents/conference/AI"
    # Without an explicit timeout a stalled connection would hang the job.
    timeout = 30

    response = requests.get(api_url, timeout=timeout)
    response.raise_for_status()
    files = response.json()
    if not isinstance(files, list):
        # Error payloads are JSON objects; iterating one would yield its keys
        # and fail later with an unrelated TypeError.
        raise ValueError(f"Unexpected directory listing from {api_url}: {files!r}")

    conferences = []
    for file in files:
        if not file['name'].endswith('.yml'):
            continue
        try:
            file_response = requests.get(file['download_url'], timeout=timeout)
            file_response.raise_for_status()
        except requests.RequestException as e:
            # One unavailable file should not discard everything else.
            print(f"Warning: Could not download {file['name']}: {e}")
            continue
        conf_data = yaml.safe_load(file_response.text)
        # The data is a list with a single item
        if isinstance(conf_data, list) and len(conf_data) > 0:
            conferences.append(conf_data[0])
    return conferences
def parse_date_range(date_str: str, year: str) -> tuple[str, str]:
    """Parse a human-readable date or date range into ISO start/end dates.

    Accepted shapes include ``"May 19, 2025"``, ``"July 13-19, 2025"``,
    ``"April 29-May 4, 2025"`` and ``"Sept 5 - Sept 9, 2025"``. Month names
    may be spelled out or abbreviated to three or more letters, with or
    without a trailing period, and the range separator may be a hyphen or a
    Unicode dash. Both dates are placed in ``year``; a range that crosses a
    year boundary is not detected.

    Args:
        date_str: The date text, optionally ending in ``", <year>"``.
        year: The four-digit year the dates belong to.

    Returns:
        A ``(start, end)`` tuple of ``YYYY-MM-DD`` strings. For a single date
        both elements are equal.

    Raises:
        ValueError: If the text cannot be interpreted as a date or range.
    """
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June', 'July',
        'August', 'September', 'October', 'November', 'December',
    )

    def split_month_day(text: str, fallback_month: int = 0) -> tuple[int, int]:
        """Return ``(month_number, day)`` for text such as ``"Sept 5"`` or ``"19"``."""
        tokens = text.replace(',', ' ').split()
        month = fallback_month
        if tokens and not tokens[0][0].isdigit():
            key = tokens.pop(0).rstrip('.').lower()
            # Match on a prefix of the full name so "Sep", "Sept" and
            # "September" all resolve to the same month. Three letters are
            # enough to make every month unambiguous.
            candidates = [
                number
                for number, name in enumerate(month_names, start=1)
                if len(key) >= 3 and name.lower().startswith(key)
            ]
            if not candidates:
                raise ValueError(f"unknown month {key!r}")
            month = candidates[0]
        if month == 0 or len(tokens) != 1:
            raise ValueError(f"expected '<month> <day>', got {text!r}")
        return month, int(tokens[0])

    # Remove the year if it appears at the end of the string
    date_str = date_str.replace(f", {year}", "")
    try:
        # Treat Unicode hyphens and dashes like the ASCII range separator.
        normalized = re.sub(r'[\u2010-\u2015]', '-', date_str)
        pieces = [piece.strip() for piece in normalized.split('-')]
        if len(pieces) > 2:
            raise ValueError("more than one range separator")

        # A single date is a range whose start and end coincide.
        start_month, start_day = split_month_day(pieces[0])
        # An end such as "19" carries no month of its own and reuses the start's.
        end_month, end_day = split_month_day(pieces[-1], start_month)

        # Building the dates directly avoids strptime's locale-dependent %B.
        start_date = datetime(int(year), start_month, start_day)
        end_date = datetime(int(year), end_month, end_day)
        return start_date.strftime('%Y-%m-%d'), end_date.strftime('%Y-%m-%d')
    except (ValueError, TypeError) as e:
        raise ValueError(f"Could not parse date: {date_str} ({e})") from e
def transform_conference_data(conferences: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Transform ccfddl format to our format.

    For each conference, the first entry of its ``confs`` list whose ``year``
    is the current year or later is converted. Conferences without such an
    entry are dropped.

    Args:
        conferences: Parsed ccfddl conference records.

    Returns:
        One dict per retained conference with the keys ``title``, ``year``,
        ``id``, ``full_name``, ``link``, ``deadline``, ``timezone``, ``date``
        and ``tags``, plus ``city``/``country``, ``abstract_deadline``,
        ``start``/``end`` and ``rankings`` when they can be derived.
    """
    transformed = []
    current_year = datetime.now().year

    for conf in conferences:
        # Get the first listed instance that is not in the past. A missing or
        # empty ``confs`` value yields no instance.
        recent_conf = next(
            (
                instance
                for instance in conf.get('confs') or []
                if instance['year'] >= current_year
            ),
            None,
        )
        if not recent_conf:
            continue

        # Only the first timeline entry is used. An absent or empty timeline
        # is treated as an entry with no deadlines instead of raising.
        timeline = (recent_conf.get('timeline') or [{}])[0]

        # Transform to our format
        transformed_conf = {
            'title': conf.get('title', ''),
            'year': recent_conf['year'],
            'id': recent_conf['id'],
            'full_name': conf.get('description', ''),
            'link': recent_conf.get('link', ''),
            'deadline': timeline.get('deadline', ''),
            'timezone': recent_conf.get('timezone', ''),
            'date': recent_conf.get('date', ''),
            'tags': [],  # We'll need to maintain a mapping for tags
        }

        # Handle city and country fields instead of place
        place = recent_conf.get('place', '')
        if place:
            if ',' in place:
                # Everything before the first comma is the city, the rest the country.
                city, country = place.split(',', 1)
                transformed_conf['city'] = city.strip()
                transformed_conf['country'] = country.strip()
            else:
                # If we can't parse, just set the country
                transformed_conf['country'] = place.strip()

        # Add optional fields
        if 'abstract_deadline' in timeline:
            transformed_conf['abstract_deadline'] = timeline['abstract_deadline']

        # Parse date range for start/end; an unparseable date only costs
        # those two fields, not the whole conference.
        if transformed_conf['date']:
            try:
                start_date, end_date = parse_date_range(
                    transformed_conf['date'],
                    str(transformed_conf['year']),
                )
            except (ValueError, TypeError, AttributeError) as e:
                print(f"Warning: Could not parse date for {transformed_conf['title']}: {e}")
            else:
                transformed_conf['start'] = start_date
                transformed_conf['end'] = end_date

        # Add rankings as separate field, e.g. "CCF: A, CORE: A*"
        rankings = [
            f"{rank_type.upper()}: {rank_value}"
            for rank_type, rank_value in (conf.get('rank') or {}).items()
        ]
        if rankings:
            transformed_conf['rankings'] = ', '.join(rankings)

        transformed.append(transformed_conf)

    return transformed
def load_all_current_conferences() -> Dict[str, List[Dict[str, Any]]]:
    """Load all current conferences from individual files.

    Reads every ``.yml`` file in ``src/data/conferences`` (relative to the
    working directory). Each non-empty file is expected to hold a list of
    conference entries.

    Returns:
        A mapping from the ``title`` of a file's first entry to the full list
        of entries in that file. Empty when the directory does not exist.
    """
    conferences_dir = 'src/data/conferences'
    conference_groups = {}

    if not os.path.exists(conferences_dir):
        return {}

    for filename in os.listdir(conferences_dir):
        if not filename.endswith('.yml'):
            continue
        filepath = os.path.join(conferences_dir, filename)
        # Read as UTF-8 explicitly so non-ASCII text loads the same way
        # regardless of the platform's default encoding.
        with open(filepath, 'r', encoding='utf-8') as f:
            conferences = yaml.safe_load(f)
        if conferences:
            # Extract conference title from the first entry
            title = conferences[0]['title']
            conference_groups[title] = conferences

    return conference_groups
def create_filename_from_title(title: str) -> str:
    """Derive the file stem used for a conference from its title.

    The title is lowercased, every character other than ASCII letters,
    digits, whitespace, ``&``, ``(``, ``)`` and ``-`` is dropped, each run of
    whitespace becomes one underscore, ``&`` is spelled out as ``and``, and
    leading/trailing underscores are trimmed.
    """
    slug = title.lower()
    # Substitutions are order-sensitive: filter characters first, then
    # collapse the whitespace that survived.
    for pattern, replacement in (
        (r'[^a-zA-Z0-9\s&()-]', ''),
        (r'\s+', '_'),
    ):
        slug = re.sub(pattern, replacement, slug)
    return slug.replace('&', 'and').strip('_')
def update_conference_loader():
    """Update the conference loader file with all current conferences.

    Regenerates ``src/utils/conferenceLoader.ts`` so that it imports every
    ``.yml`` file found in ``src/data/conferences`` (in sorted order) and
    exports their contents spread into a single array. Paths are relative to
    the working directory. The loader's directory is created if missing.
    """
    conferences_dir = 'src/data/conferences'
    loader_path = 'src/utils/conferenceLoader.ts'

    # Get all conference files
    conference_files = []
    if os.path.exists(conferences_dir):
        conference_files = sorted(
            name for name in os.listdir(conferences_dir) if name.endswith('.yml')
        )

    # Generate import statements
    imports = []
    variable_names = []
    for filename in conference_files:
        # Create variable name from filename. File stems may contain
        # characters such as "-", "(" or ")" that are not valid in an
        # identifier; each becomes "_".
        stem = filename.replace('.yml', '')
        var_name = re.sub(r'[^0-9A-Za-z_$]', '_', stem) + 'Data'
        if var_name[0].isdigit():
            # Identifiers cannot start with a digit.
            var_name = '_' + var_name
        variable_names.append(var_name)
        imports.append(f"import {var_name} from '@/data/conferences/{filename}';")

    # Generate the loader file content
    import_block = '\n'.join(imports)
    spread_block = '\n'.join(f' ...{var_name},' for var_name in variable_names)
    loader_content = f"""import {{ Conference }} from '@/types/conference';
// Import all conference YAML files
{import_block}
// Combine all conference data into a single array
const allConferencesData: Conference[] = [
{spread_block}
];
export default allConferencesData;"""

    # Write the loader file
    os.makedirs(os.path.dirname(loader_path), exist_ok=True)
    with open(loader_path, 'w', encoding='utf-8') as f:
        f.write(loader_content)

    print(f"Updated conference loader with {len(conference_files)} conference files")
def main():
    """Synchronise the per-conference YAML files with the ccfddl data.

    Loads the existing files, fetches and transforms the upstream data, merges
    the two per conference title (matching entries by ``id``), rewrites one
    YAML file per title under ``src/data/conferences`` and finally regenerates
    the TypeScript loader. Returns early with a warning when nothing was
    fetched or nothing survived the transformation.

    Raises:
        Exception: Any error from the steps above is printed and re-raised.
    """
    # Fields copied from an existing entry onto its refreshed version, so
    # values already in the local file take precedence over upstream ones.
    preserved_fields = (
        'tags', 'venue', 'hindex', 'submission_deadline',
        'timezone_submission', 'rebuttal_period_start',
        'rebuttal_period_end', 'final_decision_date',
        'review_release_date', 'commitment_deadline',
        'start', 'end', 'note', 'city', 'country', 'deadlines',
        'rankings',
    )

    try:
        # Load current conferences from individual files
        current_conference_groups = load_all_current_conferences()

        # Fetch and transform new data
        new_conferences = fetch_conference_files()
        if not new_conferences:
            print("Warning: No conferences fetched from ccfddl")
            return

        transformed_conferences = transform_conference_data(new_conferences)
        if not transformed_conferences:
            print("Warning: No conferences transformed")
            return

        # Create conferences directory if it doesn't exist
        conferences_dir = 'src/data/conferences'
        os.makedirs(conferences_dir, exist_ok=True)

        # Group new conferences by title
        new_conference_groups = {}
        for conf in transformed_conferences:
            new_conference_groups.setdefault(conf['title'], []).append(conf)

        # Update each conference group
        updated_count = 0
        for title, new_confs in new_conference_groups.items():
            filename = create_filename_from_title(title) + '.yml'
            filepath = os.path.join(conferences_dir, filename)

            # Get current conferences for this title, indexed by id
            current_confs = current_conference_groups.get(title, [])
            current_conf_dict = {conf['id']: conf for conf in current_confs}

            # Update or add new conferences
            for new_conf in new_confs:
                curr_conf = current_conf_dict.get(new_conf['id'])
                if curr_conf is not None:
                    # Update existing conference while preserving fields
                    for field in preserved_fields:
                        if field in curr_conf:
                            new_conf[field] = curr_conf[field]
                # Either replaces the existing entry or adds a new one.
                current_conf_dict[new_conf['id']] = new_conf

            # Convert back to list and sort by year; entries without a year go last
            all_confs = sorted(
                current_conf_dict.values(),
                key=lambda entry: entry.get('year', 9999),
            )

            # Write to individual file. allow_unicode emits non-ASCII
            # characters verbatim, so the file must be opened as UTF-8 rather
            # than with the platform's default encoding.
            with open(filepath, 'w', encoding='utf-8') as f:
                yaml.dump(
                    all_confs,
                    f,
                    default_flow_style=False,
                    sort_keys=False,
                    allow_unicode=True,
                )

            updated_count += 1
            print(f"Updated {filename} with {len(all_confs)} entries")

        # Update the conference loader
        update_conference_loader()

        print(f"Successfully updated {updated_count} conference files")

    except Exception as e:
        print(f"Error: {e}")
        raise
# Run the update only when this file is executed as a script, so importing
# the module has no side effects.
if __name__ == "__main__":
    main()