Update github_ai_agent.py

github_ai_agent.py CHANGED (+517 -209)
@@ -1,4 +1,4 @@
-# github_ai_agent.py
+# github_ai_agent.py - Improved version with parallel processing and error handling

 import os
 import re
@@ -10,14 +10,16 @@ from collections import defaultdict, Counter
 from itertools import combinations
 import numpy as np
 from typing import List, Dict, Tuple, Any, Optional, Union
+import concurrent.futures
+from functools import lru_cache
 import google.generativeai as genai

 # External libraries
 from github import Github, GithubException
 from sentence_transformers import SentenceTransformer
 import faiss
-from gemini_integration import GeminiClient
-from visualization_module import RepositoryVisualizer
+from gemini_integration import GeminiClient
+from visualization_module import RepositoryVisualizer


 # Configuration
@@ -27,11 +29,13 @@ class Config:
         self.gemini_api_key = os.environ.get("GEMINI_API_KEY")
         self.github_token = os.environ.get("GITHUB_ACCESS_TOKEN")
         self.embedding_model_name = "all-MiniLM-L6-v2"
-        self.gemini_model = "gemini-
+        self.gemini_model = "gemini-2.0-pro-exp-02-05"
         self.max_files_to_load = 100  # Safety limit for large repos
         self.max_token_length = 64000  # Gemini Pro context limit
         self.enable_advanced_metrics = True
         self.visualization_node_limit = 150
+        self.cache_enabled = True
+        self.cache_ttl = 3600  # Cache time to live in seconds

         # File extensions to analyze
         self.code_extensions = [
@@ -42,6 +46,7 @@ class Config:
             '.md', '.txt', '.rst', '.html', '.xml', '.json', '.yaml', '.yml'
         ]

+
 # GitHub Repository Management
 class GitHubManager:
     """Manages interaction with GitHub repositories"""
@@ -54,6 +59,7 @@ class GitHubManager:
         self.contributors_data = {}
         self.commit_history = []
         self.issues_data = []
+        self.file_cache = {}  # Cache for loaded files

     def load_repository(self, repo_url: str) -> bool:
         """Load a repository from URL"""
@@ -100,8 +106,8 @@ class GitHubManager:
             return f"{username}/{repo}"
         return None

-    def load_files(self) -> Dict[str,
-        """Load files from repository"""
+    def load_files(self) -> Dict[str, Dict]:
+        """Load files from repository with improved performance"""
         if not self.current_repo:
             return {}

@@ -109,43 +115,98 @@
             contents = self.current_repo.get_contents("")
             self.file_contents = {}
             files_loaded = 0
+            batch_size = 20  # Process files in batches
+
+            # Create a queue of files to process
+            file_queue = []
+
+            # First pass - collect all file paths
+            while contents:
+                content_item = contents.pop(0)
+
+                # Skip directories but add their contents to our processing queue
+                if content_item.type == "dir":
+                    try:
+                        dir_contents = self.current_repo.get_contents(content_item.path)
+                        contents.extend(dir_contents)
+                    except Exception as e:
+                        print(f"Error accessing directory {content_item.path}: {e}")
                    continue
+
                # Filter by extensions
-                _, ext = os.path.splitext(
+                _, ext = os.path.splitext(content_item.path)
                if ext not in self.config.code_extensions + self.config.doc_extensions:
                    continue
+
+                # Add file to processing queue
+                file_queue.append(content_item)
+
+                # Stop if we've reached our limit
+                if len(file_queue) >= self.config.max_files_to_load:
+                    break
+
+            # Process files in batches
+            for i in range(0, len(file_queue), batch_size):
+                batch = file_queue[i:i+batch_size]
+
+                # Process batch in parallel
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    future_to_file = {
+                        executor.submit(self._process_file, file_content): file_content
+                        for file_content in batch
+                    }
+
+                    for future in concurrent.futures.as_completed(future_to_file):
+                        file_content = future_to_file[future]
+                        try:
+                            result = future.result()
+                            if result:
+                                self.file_contents[file_content.path] = result
+                                files_loaded += 1
+                        except Exception as e:
+                            print(f"Error processing file {file_content.path}: {e}")

             return self.file_contents
         except Exception as e:
             print(f"Error loading files: {e}")
             return {}

+    def _process_file(self, file_content) -> Optional[Dict]:
+        """Process a single file (for parallel execution)"""
+        try:
+            # Check if in cache
+            if file_content.path in self.file_cache:
+                return self.file_cache[file_content.path]
+
+            _, ext = os.path.splitext(file_content.path)
+
+            # Only process text files with specified extensions
+            if ext not in self.config.code_extensions + self.config.doc_extensions:
+                return None
+
+            try:
+                # Decode content
+                decoded_content = file_content.decoded_content.decode('utf-8')
+                result = {
+                    'content': decoded_content,
+                    'type': 'code' if ext in self.config.code_extensions else 'document',
+                    'size': file_content.size,
+                    'ext': ext
+                }
+
+                # Update cache
+                self.file_cache[file_content.path] = result
+                return result
+            except UnicodeDecodeError:
+                # Skip binary files
+                return None
+
+        except Exception as e:
+            print(f"Error processing file {file_content.path}: {e}")
+            return None
+
     def load_contributors(self) -> List[Dict]:
-        """Load repository contributors"""
+        """Load repository contributors with improved performance"""
         if not self.current_repo:
             return []

@@ -153,67 +214,128 @@ class GitHubManager:
             contributors = self.current_repo.get_contributors()
             self.contributors_data = {}

-                    'files_modified': [],
-                    'commit_messages': [],
-                    'activity_dates': []
+            # Collect basic contributor info
+            contributor_list = list(contributors)  # Convert from PaginatedList to list
+
+            # Process in parallel
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_to_contributor = {
+                    executor.submit(self._process_contributor, contributor): contributor
+                    for contributor in contributor_list
                 }
+
+                for future in concurrent.futures.as_completed(future_to_contributor):
+                    contributor = future_to_contributor[future]
+                    try:
+                        contributor_data = future.result()
+                        if contributor_data:
+                            self.contributors_data[contributor.login] = contributor_data
+                    except Exception as e:
+                        print(f"Error processing contributor {contributor.login}: {e}")

             return list(self.contributors_data.values())
         except Exception as e:
             print(f"Error loading contributors: {e}")
             return []
+
+    def _process_contributor(self, contributor) -> Dict:
+        """Process a single contributor (for parallel execution)"""
+        try:
+            return {
+                'login': contributor.login,
+                'id': contributor.id,
+                'contributions': contributor.contributions,
+                'avatar_url': contributor.avatar_url,
+                'html_url': contributor.html_url,
+                'type': contributor.type,
+                'files_modified': [],
+                'commit_messages': [],
+                'activity_dates': []
+            }
+        except Exception as e:
+            print(f"Error processing contributor {contributor.login}: {e}")
+            return None

     def load_commits(self, limit: int = 100) -> List[Dict]:
-        """Load repository commits"""
+        """Load repository commits with improved performance"""
         if not self.current_repo:
             return []

         try:
             commits = self.current_repo.get_commits()[:limit]
             self.commit_history = []
-                    'files': []
-                        'status': file.status
-                    })
-                    # Add this file to the contributor's file list
-                    if commit.author and commit.author.login in self.contributors_data:
-                        self.contributors_data[commit.author.login]['files_modified'].append(file.filename)
-                        self.contributors_data[commit.author.login]['commit_messages'].append(commit.commit.message)
-                        self.contributors_data[commit.author.login]['activity_dates'].append(commit.commit.author.date)
-                except:
-                    # Some commits might not have accessible files
-                    pass
-
-                self.commit_history.append(commit_data)
+            commits_list = list(commits)  # Convert from PaginatedList to list
+
+            # Process commits in parallel
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_to_commit = {
+                    executor.submit(self._process_commit, commit): commit
+                    for commit in commits_list
                 }
+
+                for future in concurrent.futures.as_completed(future_to_commit):
+                    commit = future_to_commit[future]
+                    try:
+                        commit_data = future.result()
+                        if commit_data:
+                            self.commit_history.append(commit_data)
+                    except Exception as e:
+                        print(f"Error processing commit {commit.sha}: {e}")
+
+            # Process contributor file statistics
+            self._update_contributor_file_stats()
+
+            return self.commit_history
+        except Exception as e:
+            print(f"Error loading commits: {e}")
+            return []
+
+    def _process_commit(self, commit) -> Optional[Dict]:
+        """Process a single commit (for parallel execution)"""
+        try:
+            # Make sure the commit date is timezone-naive
+            commit_date = commit.commit.author.date
+            if hasattr(commit_date, 'tzinfo') and commit_date.tzinfo:
+                commit_date = commit_date.replace(tzinfo=None)
+
+            commit_data = {
+                'sha': commit.sha,
+                'author': commit.author.login if commit.author else 'Unknown',
+                'date': commit_date,
+                'message': commit.commit.message,
+                'files': []
+            }
+
+            # Get files changed in this commit
+            try:
+                commit_files = commit.files
+                for file in commit_files:
+                    file_data = {
+                        'filename': file.filename,
+                        'additions': file.additions,
+                        'deletions': file.deletions,
+                        'changes': file.changes,
+                        'status': file.status
+                    }
+                    commit_data['files'].append(file_data)
+
+                    # Add this file to the contributor's file list
+                    if commit.author and commit.author.login in self.contributors_data:
+                        self.contributors_data[commit.author.login]['files_modified'].append(file.filename)
+                        self.contributors_data[commit.author.login]['commit_messages'].append(commit.commit.message)
+                        self.contributors_data[commit.author.login]['activity_dates'].append(commit_date)
+            except Exception as e:
+                print(f"Error processing files for commit {commit.sha}: {e}")

+            return commit_data
+        except Exception as e:
+            print(f"Error processing commit {commit.sha}: {e}")
+            return None
+
-            # Count frequency of modified files for each contributor
-            for login, contributor in self.contributors_data.items():
+    def _update_contributor_file_stats(self):
+        """Update contributor file statistics"""
+        for login, contributor in self.contributors_data.items():
+            if 'files_modified' in contributor:
                 # Count occurrences of each file
                 file_counts = Counter(contributor['files_modified'])
                 # Replace list with a list of (filename, count) tuples
@@ -222,52 +344,87 @@ class GitHubManager:
                     for filename, count in file_counts.most_common(10)
                 ]

-            return self.commit_history
-        except Exception as e:
-            print(f"Error loading commits: {e}")
-            return []
-
     def load_issues(self, limit: int = 30) -> List[Dict]:
-        """Load repository issues"""
+        """Load repository issues with improved performance"""
         if not self.current_repo:
             return []

         try:
             issues = self.current_repo.get_issues(state='all')[:limit]
             self.issues_data = []
-                    'state': issue.state,
-                    'created_at': issue.created_at,
-                    'updated_at': issue.updated_at,
-                    'closed_at': issue.closed_at,
-                    'labels': [label.name for label in issue.labels],
-                    'comments': []
+            issues_list = list(issues)  # Convert from PaginatedList to list
+
+            # Process issues in parallel
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                future_to_issue = {
+                    executor.submit(self._process_issue, issue): issue
+                    for issue in issues_list
                 }
-                        issue_data
-                        })
-                except:
-                    pass
-
-                self.issues_data.append(issue_data)
+
+                for future in concurrent.futures.as_completed(future_to_issue):
+                    issue = future_to_issue[future]
+                    try:
+                        issue_data = future.result()
+                        if issue_data:
+                            self.issues_data.append(issue_data)
+                    except Exception as e:
+                        print(f"Error processing issue #{issue.number}: {e}")

             return self.issues_data
         except Exception as e:
             print(f"Error loading issues: {e}")
             return []
+
+    def _process_issue(self, issue) -> Optional[Dict]:
+        """Process a single issue (for parallel execution)"""
+        try:
+            # Normalize datetime objects
+            created_at = issue.created_at
+            updated_at = issue.updated_at
+            closed_at = issue.closed_at
+
+            if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
+                created_at = created_at.replace(tzinfo=None)
+            if hasattr(updated_at, 'tzinfo') and updated_at.tzinfo:
+                updated_at = updated_at.replace(tzinfo=None)
+            if hasattr(closed_at, 'tzinfo') and closed_at and closed_at.tzinfo:
+                closed_at = closed_at.replace(tzinfo=None)
+
+            issue_data = {
+                'number': issue.number,
+                'title': issue.title,
+                'body': issue.body,
+                'user': issue.user.login if issue.user else 'Unknown',
+                'state': issue.state,
+                'created_at': created_at,
+                'updated_at': updated_at,
+                'closed_at': closed_at,
+                'labels': [label.name for label in issue.labels],
+                'comments': []
+            }
+
+            # Get comments for this issue (limited to 10)
+            try:
+                comments = issue.get_comments()[:10]
+                for comment in comments:
+                    # Normalize datetime
+                    comment_created_at = comment.created_at
+                    if hasattr(comment_created_at, 'tzinfo') and comment_created_at.tzinfo:
+                        comment_created_at = comment_created_at.replace(tzinfo=None)
+
+                    issue_data['comments'].append({
+                        'user': comment.user.login if comment.user else 'Unknown',
+                        'body': comment.body,
+                        'created_at': comment_created_at
+                    })
+            except Exception as e:
+                print(f"Error loading comments for issue #{issue.number}: {e}")
+
+            return issue_data
+        except Exception as e:
+            print(f"Error processing issue #{issue.number}: {e}")
+            return None
+

 # Knowledge Base and Vector Storage
 class KnowledgeBase:
@@ -279,9 +436,11 @@ class KnowledgeBase:
         self.index = None
         self.knowledge_graph = nx.Graph()
         self.insights = {}
+        self.insights_cache = {}
+        self.cache_timestamp = None

     def initialize_vector_storage(self, file_contents: Dict[str, Dict]) -> None:
-        """Initialize vector storage with file contents"""
+        """Initialize vector storage with file contents and batched processing"""
         try:
             # Clear existing data
             self.embeddings = {}
@@ -291,30 +450,72 @@ class KnowledgeBase:
             texts = []
             ids = []

+            # Process files in parallel for large repositories
+            if len(file_contents) > 50:
+                with concurrent.futures.ThreadPoolExecutor() as executor:
+                    # Process files in batches
+                    batch_size = 20
+                    keys = list(file_contents.keys())
+                    batches = [keys[i:i + batch_size] for i in range(0, len(keys), batch_size)]
+
+                    # Create a function to process a batch
+                    def process_batch(batch_keys):
+                        batch_texts = []
+                        batch_ids = []
+                        for path in batch_keys:
+                            file_data = file_contents[path]
+                            content = file_data['content']
+
+                            # Skip very large files to avoid embedding issues
+                            if len(content) > 10000:
+                                content = content[:10000] + "..."
+
+                            batch_texts.append(content)
+                            batch_ids.append(path)
+                        return batch_texts, batch_ids
+
+                    # Submit batch processing tasks
+                    futures = [executor.submit(process_batch, batch) for batch in batches]
+
+                    # Collect results
+                    for future in concurrent.futures.as_completed(futures):
+                        batch_texts, batch_ids = future.result()
+                        texts.extend(batch_texts)
+                        ids.extend(batch_ids)
+            else:
+                # For smaller repositories, process sequentially
+                for path, file_data in file_contents.items():
+                    content = file_data['content']
+
+                    # Skip very large files to avoid embedding issues
+                    if len(content) > 10000:
+                        content = content[:10000] + "..."
+
+                    texts.append(content)
+                    ids.append(path)
+
+            # Add nodes to knowledge graph
             for path, file_data in file_contents.items():
-                content = file_data['content']
-                file_type = file_data['type']
-
-                # Skip very large files to avoid embedding issues
-                if len(content) > 10000:
-                    content = content[:10000] + "..."
-
-                # Store file content for embedding
-                texts.append(content)
-                ids.append(path)
-
-                # Add node to knowledge graph
                 self.knowledge_graph.add_node(
                     path,
                     type='file',
-                    file_type=
-                    size=file_data
-                    extension=file_data
+                    file_type=file_data.get('type', 'unknown'),
+                    size=file_data.get('size', 0),
+                    extension=file_data.get('ext', '')
                 )

             # Create embeddings for all files
             if texts:
+                # Process embeddings in batches to avoid memory issues
+                batch_size = 32
+                file_embeddings = []
+
+                for i in range(0, len(texts), batch_size):
+                    batch_texts = texts[i:i+batch_size]
+                    batch_embeddings = self.embedding_model.encode(batch_texts)
+                    file_embeddings.append(batch_embeddings)
+
+                file_embeddings = np.vstack(file_embeddings)

                 # Initialize FAISS index
                 dimension = file_embeddings.shape[1]
@@ -356,18 +557,24 @@ class KnowledgeBase:
                     # Create new edge
                    self.knowledge_graph.add_edge(login, filename, weight=count)

-            #
+            # Optimized co-occurrence calculation
             file_co_occurrence = defaultdict(int)
-            for commit
+
+            # Process in batches for large commit histories
+            batch_size = 50
+            for i in range(0, len(commits), batch_size):
+                batch_commits = commits[i:i+batch_size]
+
+                for commit in batch_commits:
+                    # Get all files in this commit
+                    commit_files = [file['filename'] for file in commit['files']]
+
+                    # Add co-occurrence for each pair of files
+                    from itertools import combinations
+                    for file1, file2 in combinations(commit_files, 2):
+                        if file1 in self.knowledge_graph and file2 in self.knowledge_graph:
+                            file_pair = tuple(sorted([file1, file2]))
+                            file_co_occurrence[file_pair] += 1

             # Add edges for file co-occurrence
             for (file1, file2), count in file_co_occurrence.items():
@@ -382,8 +589,9 @@ class KnowledgeBase:
             print(f"Error building knowledge graph: {e}")
             return nx.Graph()

+    @lru_cache(maxsize=32)
     def search_similar_files(self, query: str, top_k: int = 5) -> List[Dict]:
-        """Search for files similar to query"""
+        """Search for files similar to query with caching"""
         try:
             if not self.index:
                 return []
@@ -413,7 +621,12 @@ class KnowledgeBase:
             return []

     def extract_insights(self, repo_data: Dict, commits: List[Dict], contributors: Dict, issues: List[Dict]) -> Dict:
-        """Extract insights from repository data"""
+        """Extract insights from repository data with datetime fix and caching"""
+        # Check if we have a recent cache (less than 10 minutes old)
+        current_time = time.time()
+        if self.cache_timestamp and (current_time - self.cache_timestamp < 600) and self.insights_cache:
+            return self.insights_cache
+
         try:
             insights = {
                 'basic_stats': {},
@@ -423,21 +636,44 @@ class KnowledgeBase:
                 'issues': {}
             }

+            # Make a deep copy of repo_data to avoid modifying the original
+            repo_data_copy = {k: v for k, v in repo_data.items()}
+
             # Basic statistics
             insights['basic_stats'] = {
-                'name':
-                'description':
-                'stars':
-                'forks':
-                'age_days':
-                'primary_language':
-                'topics':
+                'name': repo_data_copy['name'],
+                'description': repo_data_copy['description'],
+                'stars': repo_data_copy['stars'],
+                'forks': repo_data_copy['forks'],
+                'age_days': None,  # Will calculate below
+                'primary_language': repo_data_copy['language'],
+                'topics': repo_data_copy['topics']
             }
+
+            # Fix: Normalize datetime objects to be timezone-naive for consistent comparison
+            created_at = repo_data_copy.get('created_at')
+            if created_at:
+                # Remove timezone info if present
+                if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
+                    created_at = created_at.replace(tzinfo=None)
+
+                # Calculate age
+                now = datetime.datetime.now()
+                insights['basic_stats']['age_days'] = (now - created_at).days

             # Activity insights
             if commits:
-                #
-                commit_dates = [
+                # Fix: Normalize all datetime objects to be timezone-naive
+                commit_dates = []
+                for commit in commits:
+                    date = commit.get('date')
+                    if date:
+                        # Remove timezone info if present
+                        if hasattr(date, 'tzinfo') and date.tzinfo:
+                            date = date.replace(tzinfo=None)
+                        commit_dates.append(date)
+
+                # Sort dates
                 commit_dates.sort()

                 if commit_dates:
@@ -452,9 +688,13 @@
                         'last_commit': last_commit,
                         'days_span': days_span,
                         'commits_per_day': round(len(commits) / max(days_span, 1), 2),
-                        'most_active_day': max(commit_dates, key=commit_dates.count) if commit_dates else None,
                     }

+                    # Fix: Use Counter for most active day calculation
+                    date_counter = Counter(d.date() for d in commit_dates)
+                    if date_counter:
+                        insights['activity']['most_active_day'] = date_counter.most_common(1)[0][0]
+
                     # Commit activity by month
                     commit_months = [d.strftime('%Y-%m') for d in commit_dates]
                     month_counts = Counter(commit_months)
@@ -537,7 +777,17 @@
                 close_times = []
                 for issue in closed_issues:
                     if issue['created_at'] and issue['closed_at']:
+                        # Fix: Normalize datetime objects to be timezone-naive
+                        created_at = issue['created_at']
+                        closed_at = issue['closed_at']
+
+                        if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
+                            created_at = created_at.replace(tzinfo=None)
+
+                        if hasattr(closed_at, 'tzinfo') and closed_at.tzinfo:
+                            closed_at = closed_at.replace(tzinfo=None)
+
+                        close_time = (closed_at - created_at).days
                         close_times.append(close_time)

                 if close_times:
@@ -550,26 +800,40 @@
                     {'label': label, 'count': count} for label, count in label_counts.most_common(5)
                 ]

+            # Update cache
+            self.insights_cache = insights
+            self.cache_timestamp = current_time
             self.insights = insights
+
             return insights
         except Exception as e:
+            import traceback
             print(f"Error extracting insights: {e}")
+            print(traceback.format_exc())
             return {}


 # Main GitHub AI Agent Class
 class GitHubAIAgent:
     """Main class for GitHub AI Agent"""
     def __init__(self):
         self.config = Config()
-        self.github_manager =
-        self.knowledge_base =
-        self.gemini_client =
-        self.visualization_manager =
+        self.github_manager = None
+        self.knowledge_base = None
+        self.gemini_client = None
+        self.visualization_manager = None

         self.repository_loaded = False
         self.repository_url = ""
         self.repository_analysis = {}
         self.visualizations = {}
+
+        # Initialize caches
+        self.file_cache = {}
+        self.contributor_cache = {}
+        self.commit_cache = {}
+        self.issue_cache = {}
+        self.query_cache = {}

     def set_api_keys(self, gemini_api_key: str, github_token: str = None) -> None:
         """Set API keys"""
@@ -582,13 +846,14 @@ class GitHubAIAgent:
         self.config.gemini_api_key = gemini_api_key
         self.config.github_token = github_token

-        #
+        # Initialize clients
         self.github_manager = GitHubManager(self.config)
-        self.
-        self.
+        self.knowledge_base = KnowledgeBase(self.config)
+        self.gemini_client = GeminiClient(self.config.gemini_api_key, self.config.gemini_model)
+        self.visualization_manager = RepositoryVisualizer(self.config)

     def load_repository(self, repository_url: str) -> Dict:
-        """Load and analyze a GitHub repository"""
+        """Load and analyze a GitHub repository with improved parallelization"""
         result = {
             'success': False,
             'message': '',
@@ -604,7 +869,7 @@
             self.repository_analysis = {}
             self.visualizations = {}

-            # Load repository
+            # Load repository basic info
             print(f"Loading repository: {repository_url}")
             repo_loaded = self.github_manager.load_repository(repository_url)

@@ -615,30 +880,27 @@
             # Store repository URL
             self.repository_url = repository_url

-            #
+            # Use parallel processing for loading repository data
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                # Submit tasks
+                files_future = executor.submit(self.github_manager.load_files)
+                contributors_future = executor.submit(self.github_manager.load_contributors)
+                commits_future = executor.submit(self.github_manager.load_commits)
+                issues_future = executor.submit(self.github_manager.load_issues)
+
+                # Get results
+                files = files_future.result()
+                contributors = contributors_future.result()
+                commits = commits_future.result()
+                issues = issues_future.result()
+
             result['file_count'] = len(files)
-
-            # Load contributors
-            print("Loading contributors")
-            contributors = self.github_manager.load_contributors()
             result['contributor_count'] = len(contributors)

-            #
-            # Load issues
-            print("Loading issues")
-            issues = self.github_manager.load_issues()
-
-            # Initialize vector storage
-            print("Building vector embeddings")
+            # Initialize vector storage and build knowledge graph
+            # (These are kept sequential as they depend on previous steps)
+            print("Building knowledge base")
             self.knowledge_base.initialize_vector_storage(files)
-
-            # Build knowledge graph
-            print("Building knowledge graph")
             knowledge_graph = self.knowledge_base.build_knowledge_graph(
                 commits, self.github_manager.contributors_data
             )
@@ -652,31 +914,43 @@
                 issues
             )

+            # Use a separate thread for Gemini analysis which can be slower
+            # and doesn't block the main thread
+            def analyze_with_gemini():
+                print("Analyzing repository with Gemini")
+                return self.gemini_client.analyze_repository(
+                    self.github_manager.repo_data,
+                    files,
+                    commits,
+                    self.github_manager.contributors_data,
+                    insights
+                )
+
+            # Use another thread pool for visualization generation
+            def create_visualizations():
+                print("Creating repository visualizations")
+                repo_graph_path = self.visualization_manager.create_repository_graph(knowledge_graph)
+                activity_chart_path = self.visualization_manager.create_commit_activity_chart(commits)
+                contributor_network_path = self.visualization_manager.create_contributor_network(
+                    self.github_manager.contributors_data, commits
+                )
+                dependency_graph_path = self.visualization_manager.create_file_dependency_graph(files)
+
+                return {
+                    'repository_graph': repo_graph_path,
+                    'activity_chart': activity_chart_path,
+                    'contributor_network': contributor_network_path,
+                    'dependency_graph': dependency_graph_path,
+                }
+
+            # Run Gemini analysis and visualization generation in parallel
+            with concurrent.futures.ThreadPoolExecutor() as executor:
+                analysis_future = executor.submit(analyze_with_gemini)
+                viz_future = executor.submit(create_visualizations)
+
+                # Get results
+                self.repository_analysis = analysis_future.result()
+                self.visualizations = viz_future.result()

             # Update result
             result['success'] = True
@@ -687,11 +961,15 @@

             return result
         except Exception as e:
+            import traceback
+            print(f"Error loading repository: {str(e)}")
+            print(traceback.format_exc())
             result['message'] = f"Error loading repository: {str(e)}"
             return result

+    @lru_cache(maxsize=32)
     def answer_query(self, query: str) -> Dict:
-        """Answer a natural language query about the repository"""
+        """Answer a natural language query about the repository with caching"""
         if not self.repository_loaded:
             return {
                 'success': False,
@@ -699,6 +977,14 @@
                 'answer': ""
             }

+        # Check cache if enabled
+        cache_key = f"query_{hash(query)}"
+        if self.config.cache_enabled and cache_key in self.query_cache:
+            cached_result = self.query_cache[cache_key]
+            # Check if cache is still valid
+            if time.time() - cached_result['timestamp'] < self.config.cache_ttl:
+                return cached_result['result']
+
         try:
             # Search for relevant files
             similar_files = self.knowledge_base.search_similar_files(query)
@@ -711,12 +997,21 @@
                 self.knowledge_base.insights
             )

+            result = {
                 'success': True,
                 'message': "Query answered successfully",
                 'answer': answer,
                 'relevant_files': [f['file'] for f in similar_files]
             }
+
+            # Update cache
+            if self.config.cache_enabled:
+                self.query_cache[cache_key] = {
+                    'result': result,
+                    'timestamp': time.time()
+                }
+
+            return result
         except Exception as e:
             return {
                 'success': False,
@@ -725,7 +1020,7 @@
             }

     def analyze_code(self, file_path: str = "", code_snippet: str = "", language: str = "") -> Dict:
-        """Analyze a code file or snippet"""
+        """Analyze a code file or snippet with improved error handling"""
         if not file_path and not code_snippet:
             return {
                 'success': False,
@@ -836,4 +1131,17 @@
             'success': True,
             'message': "Repository visualizations retrieved",
             'visualizations': self.visualizations
-        }
+        }
+
+    def clear_caches(self) -> None:
+        """Clear all caches"""
+        self.file_cache.clear()
+        self.contributor_cache.clear()
+        self.commit_cache.clear()
+        self.issue_cache.clear()
+        self.query_cache.clear()
+
+        # Clear LRU caches
+        self.answer_query.cache_clear()
+        if hasattr(self.knowledge_base, 'search_similar_files'):
+            self.knowledge_base.search_similar_files.cache_clear()