nihalaninihal committed
Commit 7da0953 · verified · 1 Parent(s): b4986a9

Update github_ai_agent.py

Files changed (1)
  1. github_ai_agent.py +517 -209
github_ai_agent.py CHANGED
@@ -1,4 +1,4 @@
1
- # github_ai_agent.py
2
 
3
  import os
4
  import re
@@ -10,14 +10,16 @@ from collections import defaultdict, Counter
10
  from itertools import combinations
11
  import numpy as np
12
  from typing import List, Dict, Tuple, Any, Optional, Union
 
 
13
  import google.generativeai as genai
14
 
15
  # External libraries
16
  from github import Github, GithubException
17
  from sentence_transformers import SentenceTransformer
18
  import faiss
19
- from gemini_integration import GeminiClient # Import GeminiClient
20
- from visualization_module import RepositoryVisualizer # Import RepositoryVisualizer
21
 
22
 
23
  # Configuration
@@ -27,11 +29,13 @@ class Config:
27
  self.gemini_api_key = os.environ.get("GEMINI_API_KEY")
28
  self.github_token = os.environ.get("GITHUB_ACCESS_TOKEN")
29
  self.embedding_model_name = "all-MiniLM-L6-v2"
30
- self.gemini_model = "gemini-1.5-pro"
31
  self.max_files_to_load = 100 # Safety limit for large repos
32
  self.max_token_length = 64000 # Gemini Pro context limit
33
  self.enable_advanced_metrics = True
34
  self.visualization_node_limit = 150
 
 
35
 
36
  # File extensions to analyze
37
  self.code_extensions = [
@@ -42,6 +46,7 @@ class Config:
42
  '.md', '.txt', '.rst', '.html', '.xml', '.json', '.yaml', '.yml'
43
  ]
44
 
 
45
  # GitHub Repository Management
46
  class GitHubManager:
47
  """Manages interaction with GitHub repositories"""
@@ -54,6 +59,7 @@ class GitHubManager:
54
  self.contributors_data = {}
55
  self.commit_history = []
56
  self.issues_data = []
 
57
 
58
  def load_repository(self, repo_url: str) -> bool:
59
  """Load a repository from URL"""
@@ -100,8 +106,8 @@ class GitHubManager:
100
  return f"{username}/{repo}"
101
  return None
102
 
103
- def load_files(self) -> Dict[str, str]:
104
- """Load files from repository"""
105
  if not self.current_repo:
106
  return {}
107
 
@@ -109,43 +115,98 @@ class GitHubManager:
109
  contents = self.current_repo.get_contents("")
110
  self.file_contents = {}
111
  files_loaded = 0
112
-
113
- while contents and files_loaded < self.config.max_files_to_load:
114
- file_content = contents.pop(0)
115
-
116
- # Skip directories but process their contents
117
- if file_content.type == "dir":
118
- contents.extend(self.current_repo.get_contents(file_content.path))
119
  continue
120
-
121
  # Filter by extensions
122
- _, ext = os.path.splitext(file_content.path)
123
  if ext not in self.config.code_extensions + self.config.doc_extensions:
124
  continue
125
-
126
- # Load file content
127
- try:
128
- # Handle binary files (images, etc.)
129
- if ext in self.config.code_extensions + self.config.doc_extensions:
130
- decoded_content = file_content.decoded_content.decode('utf-8')
131
- self.file_contents[file_content.path] = {
132
- 'content': decoded_content,
133
- 'type': 'code' if ext in self.config.code_extensions else 'document',
134
- 'size': file_content.size,
135
- 'ext': ext
136
- }
137
- files_loaded += 1
138
- except UnicodeDecodeError:
139
- # Skip binary files that can't be decoded as text
140
- pass
141
 
142
  return self.file_contents
143
  except Exception as e:
144
  print(f"Error loading files: {e}")
145
  return {}
146
147
  def load_contributors(self) -> List[Dict]:
148
- """Load repository contributors"""
149
  if not self.current_repo:
150
  return []
151
 
@@ -153,67 +214,128 @@ class GitHubManager:
153
  contributors = self.current_repo.get_contributors()
154
  self.contributors_data = {}
155
 
156
- for contributor in contributors:
157
- self.contributors_data[contributor.login] = {
158
- 'login': contributor.login,
159
- 'id': contributor.id,
160
- 'contributions': contributor.contributions,
161
- 'avatar_url': contributor.avatar_url,
162
- 'html_url': contributor.html_url,
163
- 'type': contributor.type,
164
- 'files_modified': [],
165
- 'commit_messages': [],
166
- 'activity_dates': []
167
  }
168
 
169
  return list(self.contributors_data.values())
170
  except Exception as e:
171
  print(f"Error loading contributors: {e}")
172
  return []
173
 
174
  def load_commits(self, limit: int = 100) -> List[Dict]:
175
- """Load repository commits"""
176
  if not self.current_repo:
177
  return []
178
 
179
  try:
180
  commits = self.current_repo.get_commits()[:limit]
181
  self.commit_history = []
182
-
183
- for commit in commits:
184
- commit_data = {
185
- 'sha': commit.sha,
186
- 'author': commit.author.login if commit.author else 'Unknown',
187
- 'date': commit.commit.author.date,
188
- 'message': commit.commit.message,
189
- 'files': []
190
  }
191
 
192
- # Get files changed in this commit
193
- try:
194
- commit_files = commit.files
195
- for file in commit_files:
196
- commit_data['files'].append({
197
- 'filename': file.filename,
198
- 'additions': file.additions,
199
- 'deletions': file.deletions,
200
- 'changes': file.changes,
201
- 'status': file.status
202
- })
203
-
204
- # Add this file to the contributor's file list
205
- if commit.author and commit.author.login in self.contributors_data:
206
- self.contributors_data[commit.author.login]['files_modified'].append(file.filename)
207
- self.contributors_data[commit.author.login]['commit_messages'].append(commit.commit.message)
208
- self.contributors_data[commit.author.login]['activity_dates'].append(commit.commit.author.date)
209
- except:
210
- # Some commits might not have accessible files
211
- pass
212
-
213
- self.commit_history.append(commit_data)
214
-
215
- # Count frequency of modified files for each contributor
216
- for login, contributor in self.contributors_data.items():
217
  # Count occurrences of each file
218
  file_counts = Counter(contributor['files_modified'])
219
  # Replace list with a list of (filename, count) tuples
@@ -222,52 +344,87 @@ class GitHubManager:
222
  for filename, count in file_counts.most_common(10)
223
  ]
224
 
225
- return self.commit_history
226
- except Exception as e:
227
- print(f"Error loading commits: {e}")
228
- return []
229
-
230
  def load_issues(self, limit: int = 30) -> List[Dict]:
231
- """Load repository issues"""
232
  if not self.current_repo:
233
  return []
234
 
235
  try:
236
  issues = self.current_repo.get_issues(state='all')[:limit]
237
  self.issues_data = []
238
-
239
- for issue in issues:
240
- issue_data = {
241
- 'number': issue.number,
242
- 'title': issue.title,
243
- 'body': issue.body,
244
- 'user': issue.user.login if issue.user else 'Unknown',
245
- 'state': issue.state,
246
- 'created_at': issue.created_at,
247
- 'updated_at': issue.updated_at,
248
- 'closed_at': issue.closed_at,
249
- 'labels': [label.name for label in issue.labels],
250
- 'comments': []
251
  }
252
-
253
- # Get comments for this issue (limited to 10)
254
- try:
255
- comments = issue.get_comments()[:10]
256
- for comment in comments:
257
- issue_data['comments'].append({
258
- 'user': comment.user.login if comment.user else 'Unknown',
259
- 'body': comment.body,
260
- 'created_at': comment.created_at
261
- })
262
- except:
263
- pass
264
-
265
- self.issues_data.append(issue_data)
266
 
267
  return self.issues_data
268
  except Exception as e:
269
  print(f"Error loading issues: {e}")
270
  return []
271
 
272
  # Knowledge Base and Vector Storage
273
  class KnowledgeBase:
@@ -279,9 +436,11 @@ class KnowledgeBase:
279
  self.index = None
280
  self.knowledge_graph = nx.Graph()
281
  self.insights = {}
282
 
283
  def initialize_vector_storage(self, file_contents: Dict[str, Dict]) -> None:
284
- """Initialize vector storage with file contents"""
285
  try:
286
  # Clear existing data
287
  self.embeddings = {}
@@ -291,30 +450,72 @@ class KnowledgeBase:
291
  texts = []
292
  ids = []
293
294
  for path, file_data in file_contents.items():
295
- content = file_data['content']
296
- file_type = file_data['type']
297
-
298
- # Skip very large files to avoid embedding issues
299
- if len(content) > 10000:
300
- content = content[:10000] + "..."
301
-
302
- # Store file content for embedding
303
- texts.append(content)
304
- ids.append(path)
305
-
306
- # Add node to knowledge graph
307
  self.knowledge_graph.add_node(
308
  path,
309
  type='file',
310
- file_type=file_type,
311
- size=file_data['size'],
312
- extension=file_data['ext']
313
  )
314
 
315
  # Create embeddings for all files
316
  if texts:
317
- file_embeddings = self.embedding_model.encode(texts)
318
 
319
  # Initialize FAISS index
320
  dimension = file_embeddings.shape[1]
@@ -356,18 +557,24 @@ class KnowledgeBase:
356
  # Create new edge
357
  self.knowledge_graph.add_edge(login, filename, weight=count)
358
 
359
- # Add connections between files based on commit co-occurrence
360
  file_co_occurrence = defaultdict(int)
361
-
362
- for commit in commits:
363
- # Get all files in this commit
364
- commit_files = [file['filename'] for file in commit['files']]
365
-
366
- # Add co-occurrence for each pair of files
367
- for file1, file2 in combinations(commit_files, 2):
368
- if file1 in self.knowledge_graph and file2 in self.knowledge_graph:
369
- file_pair = tuple(sorted([file1, file2]))
370
- file_co_occurrence[file_pair] += 1
371
 
372
  # Add edges for file co-occurrence
373
  for (file1, file2), count in file_co_occurrence.items():
@@ -382,8 +589,9 @@ class KnowledgeBase:
382
  print(f"Error building knowledge graph: {e}")
383
  return nx.Graph()
384
 
 
385
  def search_similar_files(self, query: str, top_k: int = 5) -> List[Dict]:
386
- """Search for files similar to query"""
387
  try:
388
  if not self.index:
389
  return []
@@ -413,7 +621,12 @@ class KnowledgeBase:
413
  return []
414
 
415
  def extract_insights(self, repo_data: Dict, commits: List[Dict], contributors: Dict, issues: List[Dict]) -> Dict:
416
- """Extract insights from repository data"""
 
 
 
 
 
417
  try:
418
  insights = {
419
  'basic_stats': {},
@@ -423,21 +636,44 @@ class KnowledgeBase:
423
  'issues': {}
424
  }
425
426
  # Basic statistics
427
  insights['basic_stats'] = {
428
- 'name': repo_data['name'],
429
- 'description': repo_data['description'],
430
- 'stars': repo_data['stars'],
431
- 'forks': repo_data['forks'],
432
- 'age_days': (datetime.datetime.now() - repo_data['created_at']).days if repo_data['created_at'] else 0,
433
- 'primary_language': repo_data['language'],
434
- 'topics': repo_data['topics']
435
  }
436
 
437
  # Activity insights
438
  if commits:
439
- # Convert dates to datetime objects and sort
440
- commit_dates = [commit['date'] for commit in commits if commit['date']]
441
  commit_dates.sort()
442
 
443
  if commit_dates:
@@ -452,9 +688,13 @@ class KnowledgeBase:
452
  'last_commit': last_commit,
453
  'days_span': days_span,
454
  'commits_per_day': round(len(commits) / max(days_span, 1), 2),
455
- 'most_active_day': max(commit_dates, key=commit_dates.count) if commit_dates else None,
456
  }
457
458
  # Commit activity by month
459
  commit_months = [d.strftime('%Y-%m') for d in commit_dates]
460
  month_counts = Counter(commit_months)
@@ -537,7 +777,17 @@ class KnowledgeBase:
537
  close_times = []
538
  for issue in closed_issues:
539
  if issue['created_at'] and issue['closed_at']:
540
- close_time = (issue['closed_at'] - issue['created_at']).days
541
  close_times.append(close_time)
542
 
543
  if close_times:
@@ -550,26 +800,40 @@ class KnowledgeBase:
550
  {'label': label, 'count': count} for label, count in label_counts.most_common(5)
551
  ]
552
553
  self.insights = insights
 
554
  return insights
555
  except Exception as e:
 
556
  print(f"Error extracting insights: {e}")
 
557
  return {}
558
 
 
559
  # Main GitHub AI Agent Class
560
  class GitHubAIAgent:
561
  """Main class for GitHub AI Agent"""
562
  def __init__(self):
563
  self.config = Config()
564
- self.github_manager = GitHubManager(self.config)
565
- self.knowledge_base = KnowledgeBase(self.config)
566
- self.gemini_client = GeminiClient(self.config)
567
- self.visualization_manager = RepositoryVisualizer(self.config)
568
 
569
  self.repository_loaded = False
570
  self.repository_url = ""
571
  self.repository_analysis = {}
572
  self.visualizations = {}
573
 
574
  def set_api_keys(self, gemini_api_key: str, github_token: str = None) -> None:
575
  """Set API keys"""
@@ -582,13 +846,14 @@ class GitHubAIAgent:
582
  self.config.gemini_api_key = gemini_api_key
583
  self.config.github_token = github_token
584
 
585
- # Reinitialize clients
586
  self.github_manager = GitHubManager(self.config)
587
- self.gemini_client = GeminiClient(self.config)
588
- self.visualization_manager = RepositoryVisualizer(self.config) # Reinitialize
 
589
 
590
  def load_repository(self, repository_url: str) -> Dict:
591
- """Load and analyze a GitHub repository"""
592
  result = {
593
  'success': False,
594
  'message': '',
@@ -604,7 +869,7 @@ class GitHubAIAgent:
604
  self.repository_analysis = {}
605
  self.visualizations = {}
606
 
607
- # Load repository
608
  print(f"Loading repository: {repository_url}")
609
  repo_loaded = self.github_manager.load_repository(repository_url)
610
 
@@ -615,30 +880,27 @@ class GitHubAIAgent:
615
  # Store repository URL
616
  self.repository_url = repository_url
617
 
618
- # Load repository files
619
- print("Loading repository files")
620
- files = self.github_manager.load_files()
 
621
  result['file_count'] = len(files)
622
-
623
- # Load contributors
624
- print("Loading contributors")
625
- contributors = self.github_manager.load_contributors()
626
  result['contributor_count'] = len(contributors)
627
 
628
- # Load commits
629
- print("Loading commit history")
630
- commits = self.github_manager.load_commits()
631
-
632
- # Load issues
633
- print("Loading issues")
634
- issues = self.github_manager.load_issues()
635
-
636
- # Initialize vector storage
637
- print("Building vector embeddings")
638
  self.knowledge_base.initialize_vector_storage(files)
639
-
640
- # Build knowledge graph
641
- print("Building knowledge graph")
642
  knowledge_graph = self.knowledge_base.build_knowledge_graph(
643
  commits, self.github_manager.contributors_data
644
  )
@@ -652,31 +914,43 @@ class GitHubAIAgent:
652
  issues
653
  )
654
 
655
- # Analyze repository with Gemini
656
- print("Analyzing repository with Gemini")
657
- self.repository_analysis = self.gemini_client.analyze_repository(
658
- self.github_manager.repo_data,
659
- files,
660
- commits,
661
- self.github_manager.contributors_data,
662
- insights
663
- )
664
-
665
- # Create repository visualizations
666
- print("Creating repository visualizations")
667
- repo_graph_path = self.visualization_manager.create_repository_graph(knowledge_graph)
668
- activity_chart_path = self.visualization_manager.create_commit_activity_chart(commits)
669
- contributor_network_path = self.visualization_manager.create_contributor_network(
670
- self.github_manager.contributors_data, commits
671
- )
672
- dependency_graph_path = self.visualization_manager.create_file_dependency_graph(files)
673
-
674
- self.visualizations = {
675
- 'repository_graph': repo_graph_path,
676
- 'activity_chart': activity_chart_path,
677
- 'contributor_network': contributor_network_path,
678
- 'dependency_graph': dependency_graph_path,
679
- }
 
 
 
 
 
 
 
 
 
 
 
 
680
 
681
  # Update result
682
  result['success'] = True
@@ -687,11 +961,15 @@ class GitHubAIAgent:
687
 
688
  return result
689
  except Exception as e:
 
690
  result['message'] = f"Error loading repository: {str(e)}"
691
  return result
692
 
 
693
  def answer_query(self, query: str) -> Dict:
694
- """Answer a natural language query about the repository"""
695
  if not self.repository_loaded:
696
  return {
697
  'success': False,
@@ -699,6 +977,14 @@ class GitHubAIAgent:
699
  'answer': ""
700
  }
701
 
 
 
 
 
 
 
 
 
702
  try:
703
  # Search for relevant files
704
  similar_files = self.knowledge_base.search_similar_files(query)
@@ -711,12 +997,21 @@ class GitHubAIAgent:
711
  self.knowledge_base.insights
712
  )
713
 
714
- return {
715
  'success': True,
716
  'message': "Query answered successfully",
717
  'answer': answer,
718
  'relevant_files': [f['file'] for f in similar_files]
719
  }
 
 
 
 
 
 
 
 
 
720
  except Exception as e:
721
  return {
722
  'success': False,
@@ -725,7 +1020,7 @@ class GitHubAIAgent:
725
  }
726
 
727
  def analyze_code(self, file_path: str = "", code_snippet: str = "", language: str = "") -> Dict:
728
- """Analyze a code file or snippet"""
729
  if not file_path and not code_snippet:
730
  return {
731
  'success': False,
@@ -836,4 +1131,17 @@ class GitHubAIAgent:
836
  'success': True,
837
  'message': "Repository visualizations retrieved",
838
  'visualizations': self.visualizations
839
- }
 
 
 
1
+ # github_ai_agent.py - Improved version with parallel processing and error handling
2
 
3
  import os
4
  import re
 
10
  from itertools import combinations
11
  import numpy as np
12
  from typing import List, Dict, Tuple, Any, Optional, Union
13
+ import concurrent.futures
14
+ from functools import lru_cache
15
  import google.generativeai as genai
16
 
17
  # External libraries
18
  from github import Github, GithubException
19
  from sentence_transformers import SentenceTransformer
20
  import faiss
21
+ from gemini_integration import GeminiClient
22
+ from visualization_module import RepositoryVisualizer
23
 
24
 
25
  # Configuration
 
29
  self.gemini_api_key = os.environ.get("GEMINI_API_KEY")
30
  self.github_token = os.environ.get("GITHUB_ACCESS_TOKEN")
31
  self.embedding_model_name = "all-MiniLM-L6-v2"
32
+ self.gemini_model = "gemini-2.0-pro-exp-02-05"
33
  self.max_files_to_load = 100 # Safety limit for large repos
34
  self.max_token_length = 64000 # Gemini Pro context limit
35
  self.enable_advanced_metrics = True
36
  self.visualization_node_limit = 150
37
+ self.cache_enabled = True
38
+ self.cache_ttl = 3600 # Cache time to live in seconds
39
 
40
  # File extensions to analyze
41
  self.code_extensions = [
 
46
  '.md', '.txt', '.rst', '.html', '.xml', '.json', '.yaml', '.yml'
47
  ]
48
 
49
+
50
  # GitHub Repository Management
51
  class GitHubManager:
52
  """Manages interaction with GitHub repositories"""
 
59
  self.contributors_data = {}
60
  self.commit_history = []
61
  self.issues_data = []
62
+ self.file_cache = {} # Cache for loaded files
63
 
64
  def load_repository(self, repo_url: str) -> bool:
65
  """Load a repository from URL"""
 
106
  return f"{username}/{repo}"
107
  return None
108
 
109
+ def load_files(self) -> Dict[str, Dict]:
110
+ """Load files from repository with improved performance"""
111
  if not self.current_repo:
112
  return {}
113
 
 
115
  contents = self.current_repo.get_contents("")
116
  self.file_contents = {}
117
  files_loaded = 0
118
+ batch_size = 20 # Process files in batches
119
+
120
+ # Create a queue of files to process
121
+ file_queue = []
122
+
123
+ # First pass - collect all file paths
124
+ while contents:
125
+ content_item = contents.pop(0)
126
+
127
+ # Skip directories but add their contents to our processing queue
128
+ if content_item.type == "dir":
129
+ try:
130
+ dir_contents = self.current_repo.get_contents(content_item.path)
131
+ contents.extend(dir_contents)
132
+ except Exception as e:
133
+ print(f"Error accessing directory {content_item.path}: {e}")
134
  continue
135
+
136
  # Filter by extensions
137
+ _, ext = os.path.splitext(content_item.path)
138
  if ext not in self.config.code_extensions + self.config.doc_extensions:
139
  continue
140
+
141
+ # Add file to processing queue
142
+ file_queue.append(content_item)
143
+
144
+ # Stop if we've reached our limit
145
+ if len(file_queue) >= self.config.max_files_to_load:
146
+ break
147
+
148
+ # Process files in batches
149
+ for i in range(0, len(file_queue), batch_size):
150
+ batch = file_queue[i:i+batch_size]
151
+
152
+ # Process batch in parallel
153
+ with concurrent.futures.ThreadPoolExecutor() as executor:
154
+ future_to_file = {
155
+ executor.submit(self._process_file, file_content): file_content
156
+ for file_content in batch
157
+ }
158
+
159
+ for future in concurrent.futures.as_completed(future_to_file):
160
+ file_content = future_to_file[future]
161
+ try:
162
+ result = future.result()
163
+ if result:
164
+ self.file_contents[file_content.path] = result
165
+ files_loaded += 1
166
+ except Exception as e:
167
+ print(f"Error processing file {file_content.path}: {e}")
168
 
169
  return self.file_contents
170
  except Exception as e:
171
  print(f"Error loading files: {e}")
172
  return {}
173
 
174
+ def _process_file(self, file_content) -> Optional[Dict]:
175
+ """Process a single file (for parallel execution)"""
176
+ try:
177
+ # Check if in cache
178
+ if file_content.path in self.file_cache:
179
+ return self.file_cache[file_content.path]
180
+
181
+ _, ext = os.path.splitext(file_content.path)
182
+
183
+ # Only process text files with specified extensions
184
+ if ext not in self.config.code_extensions + self.config.doc_extensions:
185
+ return None
186
+
187
+ try:
188
+ # Decode content
189
+ decoded_content = file_content.decoded_content.decode('utf-8')
190
+ result = {
191
+ 'content': decoded_content,
192
+ 'type': 'code' if ext in self.config.code_extensions else 'document',
193
+ 'size': file_content.size,
194
+ 'ext': ext
195
+ }
196
+
197
+ # Update cache
198
+ self.file_cache[file_content.path] = result
199
+ return result
200
+ except UnicodeDecodeError:
201
+ # Skip binary files
202
+ return None
203
+
204
+ except Exception as e:
205
+ print(f"Error processing file {file_content.path}: {e}")
206
+ return None
207
+
208
  def load_contributors(self) -> List[Dict]:
209
+ """Load repository contributors with improved performance"""
210
  if not self.current_repo:
211
  return []
212
 
 
214
  contributors = self.current_repo.get_contributors()
215
  self.contributors_data = {}
216
 
217
+ # Collect basic contributor info
218
+ contributor_list = list(contributors) # Convert from PaginatedList to list
219
+
220
+ # Process in parallel
221
+ with concurrent.futures.ThreadPoolExecutor() as executor:
222
+ future_to_contributor = {
223
+ executor.submit(self._process_contributor, contributor): contributor
224
+ for contributor in contributor_list
225
  }
226
+
227
+ for future in concurrent.futures.as_completed(future_to_contributor):
228
+ contributor = future_to_contributor[future]
229
+ try:
230
+ contributor_data = future.result()
231
+ if contributor_data:
232
+ self.contributors_data[contributor.login] = contributor_data
233
+ except Exception as e:
234
+ print(f"Error processing contributor {contributor.login}: {e}")
235
 
236
  return list(self.contributors_data.values())
237
  except Exception as e:
238
  print(f"Error loading contributors: {e}")
239
  return []
240
+
241
+ def _process_contributor(self, contributor) -> Dict:
242
+ """Process a single contributor (for parallel execution)"""
243
+ try:
244
+ return {
245
+ 'login': contributor.login,
246
+ 'id': contributor.id,
247
+ 'contributions': contributor.contributions,
248
+ 'avatar_url': contributor.avatar_url,
249
+ 'html_url': contributor.html_url,
250
+ 'type': contributor.type,
251
+ 'files_modified': [],
252
+ 'commit_messages': [],
253
+ 'activity_dates': []
254
+ }
255
+ except Exception as e:
256
+ print(f"Error processing contributor {contributor.login}: {e}")
257
+ return None
258
 
259
  def load_commits(self, limit: int = 100) -> List[Dict]:
260
+ """Load repository commits with improved performance"""
261
  if not self.current_repo:
262
  return []
263
 
264
  try:
265
  commits = self.current_repo.get_commits()[:limit]
266
  self.commit_history = []
267
+ commits_list = list(commits) # Convert from PaginatedList to list
268
+
269
+ # Process commits in parallel
270
+ with concurrent.futures.ThreadPoolExecutor() as executor:
271
+ future_to_commit = {
272
+ executor.submit(self._process_commit, commit): commit
273
+ for commit in commits_list
 
274
  }
275
+
276
+ for future in concurrent.futures.as_completed(future_to_commit):
277
+ commit = future_to_commit[future]
278
+ try:
279
+ commit_data = future.result()
280
+ if commit_data:
281
+ self.commit_history.append(commit_data)
282
+ except Exception as e:
283
+ print(f"Error processing commit {commit.sha}: {e}")
284
+
285
+ # Process contributor file statistics
286
+ self._update_contributor_file_stats()
287
+
288
+ return self.commit_history
289
+ except Exception as e:
290
+ print(f"Error loading commits: {e}")
291
+ return []
292
+
293
+ def _process_commit(self, commit) -> Optional[Dict]:
294
+ """Process a single commit (for parallel execution)"""
295
+ try:
296
+ # Make sure the commit date is timezone-naive
297
+ commit_date = commit.commit.author.date
298
+ if hasattr(commit_date, 'tzinfo') and commit_date.tzinfo:
299
+ commit_date = commit_date.replace(tzinfo=None)
300
+
301
+ commit_data = {
302
+ 'sha': commit.sha,
303
+ 'author': commit.author.login if commit.author else 'Unknown',
304
+ 'date': commit_date,
305
+ 'message': commit.commit.message,
306
+ 'files': []
307
+ }
308
+
309
+ # Get files changed in this commit
310
+ try:
311
+ commit_files = commit.files
312
+ for file in commit_files:
313
+ file_data = {
314
+ 'filename': file.filename,
315
+ 'additions': file.additions,
316
+ 'deletions': file.deletions,
317
+ 'changes': file.changes,
318
+ 'status': file.status
319
+ }
320
+ commit_data['files'].append(file_data)
321
+
322
+ # Add this file to the contributor's file list
323
+ if commit.author and commit.author.login in self.contributors_data:
324
+ self.contributors_data[commit.author.login]['files_modified'].append(file.filename)
325
+ self.contributors_data[commit.author.login]['commit_messages'].append(commit.commit.message)
326
+ self.contributors_data[commit.author.login]['activity_dates'].append(commit_date)
327
+ except Exception as e:
328
+ print(f"Error processing files for commit {commit.sha}: {e}")
329
 
330
+ return commit_data
331
+ except Exception as e:
332
+ print(f"Error processing commit {commit.sha}: {e}")
333
+ return None
334
+
335
+ def _update_contributor_file_stats(self):
336
+ """Update contributor file statistics"""
337
+ for login, contributor in self.contributors_data.items():
338
+ if 'files_modified' in contributor:
339
  # Count occurrences of each file
340
  file_counts = Counter(contributor['files_modified'])
341
  # Replace list with a list of (filename, count) tuples
 
344
  for filename, count in file_counts.most_common(10)
345
  ]
346
347
  def load_issues(self, limit: int = 30) -> List[Dict]:
348
+ """Load repository issues with improved performance"""
349
  if not self.current_repo:
350
  return []
351
 
352
  try:
353
  issues = self.current_repo.get_issues(state='all')[:limit]
354
  self.issues_data = []
355
+ issues_list = list(issues) # Convert from PaginatedList to list
356
+
357
+ # Process issues in parallel
358
+ with concurrent.futures.ThreadPoolExecutor() as executor:
359
+ future_to_issue = {
360
+ executor.submit(self._process_issue, issue): issue
361
+ for issue in issues_list
362
  }
363
+
364
+ for future in concurrent.futures.as_completed(future_to_issue):
365
+ issue = future_to_issue[future]
366
+ try:
367
+ issue_data = future.result()
368
+ if issue_data:
369
+ self.issues_data.append(issue_data)
370
+ except Exception as e:
371
+ print(f"Error processing issue #{issue.number}: {e}")
 
 
 
 
 
372
 
373
  return self.issues_data
374
  except Exception as e:
375
  print(f"Error loading issues: {e}")
376
  return []
377
+
378
+ def _process_issue(self, issue) -> Optional[Dict]:
379
+ """Process a single issue (for parallel execution)"""
380
+ try:
381
+ # Normalize datetime objects
382
+ created_at = issue.created_at
383
+ updated_at = issue.updated_at
384
+ closed_at = issue.closed_at
385
+
386
+ if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
387
+ created_at = created_at.replace(tzinfo=None)
388
+ if hasattr(updated_at, 'tzinfo') and updated_at.tzinfo:
389
+ updated_at = updated_at.replace(tzinfo=None)
390
+ if hasattr(closed_at, 'tzinfo') and closed_at and closed_at.tzinfo:
391
+ closed_at = closed_at.replace(tzinfo=None)
392
+
393
+ issue_data = {
394
+ 'number': issue.number,
395
+ 'title': issue.title,
396
+ 'body': issue.body,
397
+ 'user': issue.user.login if issue.user else 'Unknown',
398
+ 'state': issue.state,
399
+ 'created_at': created_at,
400
+ 'updated_at': updated_at,
401
+ 'closed_at': closed_at,
402
+ 'labels': [label.name for label in issue.labels],
403
+ 'comments': []
404
+ }
405
+
406
+ # Get comments for this issue (limited to 10)
407
+ try:
408
+ comments = issue.get_comments()[:10]
409
+ for comment in comments:
410
+ # Normalize datetime
411
+ comment_created_at = comment.created_at
412
+ if hasattr(comment_created_at, 'tzinfo') and comment_created_at.tzinfo:
413
+ comment_created_at = comment_created_at.replace(tzinfo=None)
414
+
415
+ issue_data['comments'].append({
416
+ 'user': comment.user.login if comment.user else 'Unknown',
417
+ 'body': comment.body,
418
+ 'created_at': comment_created_at
419
+ })
420
+ except Exception as e:
421
+ print(f"Error loading comments for issue #{issue.number}: {e}")
422
+
423
+ return issue_data
424
+ except Exception as e:
425
+ print(f"Error processing issue #{issue.number}: {e}")
426
+ return None
427
+
428
 
429
  # Knowledge Base and Vector Storage
430
  class KnowledgeBase:
 
436
  self.index = None
437
  self.knowledge_graph = nx.Graph()
438
  self.insights = {}
439
+ self.insights_cache = {}
440
+ self.cache_timestamp = None
441
 
442
  def initialize_vector_storage(self, file_contents: Dict[str, Dict]) -> None:
443
+ """Initialize vector storage with file contents and batched processing"""
444
  try:
445
  # Clear existing data
446
  self.embeddings = {}
 
450
  texts = []
451
  ids = []
452
 
453
+ # Process files in parallel for large repositories
454
+ if len(file_contents) > 50:
455
+ with concurrent.futures.ThreadPoolExecutor() as executor:
456
+ # Process files in batches
457
+ batch_size = 20
458
+ keys = list(file_contents.keys())
459
+ batches = [keys[i:i + batch_size] for i in range(0, len(keys), batch_size)]
460
+
461
+ # Create a function to process a batch
462
+ def process_batch(batch_keys):
463
+ batch_texts = []
464
+ batch_ids = []
465
+ for path in batch_keys:
466
+ file_data = file_contents[path]
467
+ content = file_data['content']
468
+
469
+ # Skip very large files to avoid embedding issues
470
+ if len(content) > 10000:
471
+ content = content[:10000] + "..."
472
+
473
+ batch_texts.append(content)
474
+ batch_ids.append(path)
475
+ return batch_texts, batch_ids
476
+
477
+ # Submit batch processing tasks
478
+ futures = [executor.submit(process_batch, batch) for batch in batches]
479
+
480
+ # Collect results
481
+ for future in concurrent.futures.as_completed(futures):
482
+ batch_texts, batch_ids = future.result()
483
+ texts.extend(batch_texts)
484
+ ids.extend(batch_ids)
485
+ else:
486
+ # For smaller repositories, process sequentially
487
+ for path, file_data in file_contents.items():
488
+ content = file_data['content']
489
+
490
+ # Skip very large files to avoid embedding issues
491
+ if len(content) > 10000:
492
+ content = content[:10000] + "..."
493
+
494
+ texts.append(content)
495
+ ids.append(path)
496
+
497
+ # Add nodes to knowledge graph
498
  for path, file_data in file_contents.items():
499
  self.knowledge_graph.add_node(
500
  path,
501
  type='file',
502
+ file_type=file_data.get('type', 'unknown'),
503
+ size=file_data.get('size', 0),
504
+ extension=file_data.get('ext', '')
505
  )
506
 
507
  # Create embeddings for all files
508
  if texts:
509
+ # Process embeddings in batches to avoid memory issues
510
+ batch_size = 32
511
+ file_embeddings = []
512
+
513
+ for i in range(0, len(texts), batch_size):
514
+ batch_texts = texts[i:i+batch_size]
515
+ batch_embeddings = self.embedding_model.encode(batch_texts)
516
+ file_embeddings.append(batch_embeddings)
517
+
518
+ file_embeddings = np.vstack(file_embeddings)
519
 
520
  # Initialize FAISS index
521
  dimension = file_embeddings.shape[1]
 
557
  # Create new edge
558
  self.knowledge_graph.add_edge(login, filename, weight=count)
559
 
560
+ # Optimized co-occurrence calculation
561
  file_co_occurrence = defaultdict(int)
562
+
563
+ # Process in batches for large commit histories
564
+ batch_size = 50
565
+ for i in range(0, len(commits), batch_size):
566
+ batch_commits = commits[i:i+batch_size]
567
+
568
+ for commit in batch_commits:
569
+ # Get all files in this commit
570
+ commit_files = [file['filename'] for file in commit['files']]
571
+
572
+ # Add co-occurrence for each pair of files
573
+ from itertools import combinations
574
+ for file1, file2 in combinations(commit_files, 2):
575
+ if file1 in self.knowledge_graph and file2 in self.knowledge_graph:
576
+ file_pair = tuple(sorted([file1, file2]))
577
+ file_co_occurrence[file_pair] += 1
578
 
579
  # Add edges for file co-occurrence
580
  for (file1, file2), count in file_co_occurrence.items():
 
589
  print(f"Error building knowledge graph: {e}")
590
  return nx.Graph()
591
 
592
+ @lru_cache(maxsize=32)
593
  def search_similar_files(self, query: str, top_k: int = 5) -> List[Dict]:
594
+ """Search for files similar to query with caching"""
595
  try:
596
  if not self.index:
597
  return []
 
621
  return []
622
 
623
  def extract_insights(self, repo_data: Dict, commits: List[Dict], contributors: Dict, issues: List[Dict]) -> Dict:
624
+ """Extract insights from repository data with datetime fix and caching"""
625
+ # Check if we have a recent cache (less than 10 minutes old)
626
+ current_time = time.time()
627
+ if self.cache_timestamp and (current_time - self.cache_timestamp < 600) and self.insights_cache:
628
+ return self.insights_cache
629
+
630
  try:
631
  insights = {
632
  'basic_stats': {},
 
636
  'issues': {}
637
  }
638
 
639
+ # Make a deep copy of repo_data to avoid modifying the original
640
+ repo_data_copy = {k: v for k, v in repo_data.items()}
641
+
642
  # Basic statistics
643
  insights['basic_stats'] = {
644
+ 'name': repo_data_copy['name'],
645
+ 'description': repo_data_copy['description'],
646
+ 'stars': repo_data_copy['stars'],
647
+ 'forks': repo_data_copy['forks'],
648
+ 'age_days': None, # Will calculate below
649
+ 'primary_language': repo_data_copy['language'],
650
+ 'topics': repo_data_copy['topics']
651
  }
652
+
653
+ # Fix: Normalize datetime objects to be timezone-naive for consistent comparison
654
+ created_at = repo_data_copy.get('created_at')
655
+ if created_at:
656
+ # Remove timezone info if present
657
+ if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
658
+ created_at = created_at.replace(tzinfo=None)
659
+
660
+ # Calculate age
661
+ now = datetime.datetime.now()
662
+ insights['basic_stats']['age_days'] = (now - created_at).days
663
 
664
  # Activity insights
665
  if commits:
666
+ # Fix: Normalize all datetime objects to be timezone-naive
667
+ commit_dates = []
668
+ for commit in commits:
669
+ date = commit.get('date')
670
+ if date:
671
+ # Remove timezone info if present
672
+ if hasattr(date, 'tzinfo') and date.tzinfo:
673
+ date = date.replace(tzinfo=None)
674
+ commit_dates.append(date)
675
+
676
+ # Sort dates
677
  commit_dates.sort()
678
 
679
  if commit_dates:
 
688
  'last_commit': last_commit,
689
  'days_span': days_span,
690
  'commits_per_day': round(len(commits) / max(days_span, 1), 2),
 
691
  }
692
 
693
+ # Fix: Use Counter for most active day calculation
694
+ date_counter = Counter(d.date() for d in commit_dates)
695
+ if date_counter:
696
+ insights['activity']['most_active_day'] = date_counter.most_common(1)[0][0]
697
+
698
  # Commit activity by month
699
  commit_months = [d.strftime('%Y-%m') for d in commit_dates]
700
  month_counts = Counter(commit_months)
 
777
  close_times = []
778
  for issue in closed_issues:
779
  if issue['created_at'] and issue['closed_at']:
780
+ # Fix: Normalize datetime objects to be timezone-naive
781
+ created_at = issue['created_at']
782
+ closed_at = issue['closed_at']
783
+
784
+ if hasattr(created_at, 'tzinfo') and created_at.tzinfo:
785
+ created_at = created_at.replace(tzinfo=None)
786
+
787
+ if hasattr(closed_at, 'tzinfo') and closed_at.tzinfo:
788
+ closed_at = closed_at.replace(tzinfo=None)
789
+
790
+ close_time = (closed_at - created_at).days
791
  close_times.append(close_time)
792
 
793
  if close_times:
 
800
  {'label': label, 'count': count} for label, count in label_counts.most_common(5)
801
  ]
802
 
803
+ # Update cache
804
+ self.insights_cache = insights
805
+ self.cache_timestamp = current_time
806
  self.insights = insights
807
+
808
  return insights
809
  except Exception as e:
810
+ import traceback
811
  print(f"Error extracting insights: {e}")
812
+ print(traceback.format_exc())
813
  return {}
814
 
815
+
816
  # Main GitHub AI Agent Class
817
  class GitHubAIAgent:
818
  """Main class for GitHub AI Agent"""
819
  def __init__(self):
820
  self.config = Config()
821
+ self.github_manager = None
822
+ self.knowledge_base = None
823
+ self.gemini_client = None
824
+ self.visualization_manager = None
825
 
826
  self.repository_loaded = False
827
  self.repository_url = ""
828
  self.repository_analysis = {}
829
  self.visualizations = {}
830
+
831
+ # Initialize caches
832
+ self.file_cache = {}
833
+ self.contributor_cache = {}
834
+ self.commit_cache = {}
835
+ self.issue_cache = {}
836
+ self.query_cache = {}
837
 
838
  def set_api_keys(self, gemini_api_key: str, github_token: str = None) -> None:
839
  """Set API keys"""
 
846
  self.config.gemini_api_key = gemini_api_key
847
  self.config.github_token = github_token
848
 
849
+ # Initialize clients
850
  self.github_manager = GitHubManager(self.config)
851
+ self.knowledge_base = KnowledgeBase(self.config)
852
+ self.gemini_client = GeminiClient(self.config.gemini_api_key, self.config.gemini_model)
853
+ self.visualization_manager = RepositoryVisualizer(self.config)
854
 
855
  def load_repository(self, repository_url: str) -> Dict:
856
+ """Load and analyze a GitHub repository with improved parallelization"""
857
  result = {
858
  'success': False,
859
  'message': '',
 
869
  self.repository_analysis = {}
870
  self.visualizations = {}
871
 
872
+ # Load repository basic info
873
  print(f"Loading repository: {repository_url}")
874
  repo_loaded = self.github_manager.load_repository(repository_url)
875
 
 
880
  # Store repository URL
881
  self.repository_url = repository_url
882
 
883
+ # Use parallel processing for loading repository data
884
+ with concurrent.futures.ThreadPoolExecutor() as executor:
885
+ # Submit tasks
886
+ files_future = executor.submit(self.github_manager.load_files)
887
+ contributors_future = executor.submit(self.github_manager.load_contributors)
888
+ commits_future = executor.submit(self.github_manager.load_commits)
889
+ issues_future = executor.submit(self.github_manager.load_issues)
890
+
891
+ # Get results
892
+ files = files_future.result()
893
+ contributors = contributors_future.result()
894
+ commits = commits_future.result()
895
+ issues = issues_future.result()
896
+
897
  result['file_count'] = len(files)
898
  result['contributor_count'] = len(contributors)
899
 
900
+ # Initialize vector storage and build knowledge graph
901
+ # (These are kept sequential as they depend on previous steps)
902
+ print("Building knowledge base")
 
 
 
 
 
 
 
903
  self.knowledge_base.initialize_vector_storage(files)
904
  knowledge_graph = self.knowledge_base.build_knowledge_graph(
905
  commits, self.github_manager.contributors_data
906
  )
 
914
  issues
915
  )
916
 
917
+ # Use a separate thread for Gemini analysis which can be slower
918
+ # and doesn't block the main thread
919
+ def analyze_with_gemini():
920
+ print("Analyzing repository with Gemini")
921
+ return self.gemini_client.analyze_repository(
922
+ self.github_manager.repo_data,
923
+ files,
924
+ commits,
925
+ self.github_manager.contributors_data,
926
+ insights
927
+ )
928
+
929
+ # Use another thread pool for visualization generation
930
+ def create_visualizations():
931
+ print("Creating repository visualizations")
932
+ repo_graph_path = self.visualization_manager.create_repository_graph(knowledge_graph)
933
+ activity_chart_path = self.visualization_manager.create_commit_activity_chart(commits)
934
+ contributor_network_path = self.visualization_manager.create_contributor_network(
935
+ self.github_manager.contributors_data, commits
936
+ )
937
+ dependency_graph_path = self.visualization_manager.create_file_dependency_graph(files)
938
+
939
+ return {
940
+ 'repository_graph': repo_graph_path,
941
+ 'activity_chart': activity_chart_path,
942
+ 'contributor_network': contributor_network_path,
943
+ 'dependency_graph': dependency_graph_path,
944
+ }
945
+
946
+ # Run Gemini analysis and visualization generation in parallel
947
+ with concurrent.futures.ThreadPoolExecutor() as executor:
948
+ analysis_future = executor.submit(analyze_with_gemini)
949
+ viz_future = executor.submit(create_visualizations)
950
+
951
+ # Get results
952
+ self.repository_analysis = analysis_future.result()
953
+ self.visualizations = viz_future.result()
954
 
955
  # Update result
956
  result['success'] = True
 
961
 
962
  return result
963
  except Exception as e:
964
+ import traceback
965
+ print(f"Error loading repository: {str(e)}")
966
+ print(traceback.format_exc())
967
  result['message'] = f"Error loading repository: {str(e)}"
968
  return result
969
 
970
+ @lru_cache(maxsize=32)
971
  def answer_query(self, query: str) -> Dict:
972
+ """Answer a natural language query about the repository with caching"""
973
  if not self.repository_loaded:
974
  return {
975
  'success': False,
 
977
  'answer': ""
978
  }
979
 
980
+ # Check cache if enabled
981
+ cache_key = f"query_{hash(query)}"
982
+ if self.config.cache_enabled and cache_key in self.query_cache:
983
+ cached_result = self.query_cache[cache_key]
984
+ # Check if cache is still valid
985
+ if time.time() - cached_result['timestamp'] < self.config.cache_ttl:
986
+ return cached_result['result']
987
+
988
  try:
989
  # Search for relevant files
990
  similar_files = self.knowledge_base.search_similar_files(query)
 
997
  self.knowledge_base.insights
998
  )
999
 
1000
+ result = {
1001
  'success': True,
1002
  'message': "Query answered successfully",
1003
  'answer': answer,
1004
  'relevant_files': [f['file'] for f in similar_files]
1005
  }
1006
+
1007
+ # Update cache
1008
+ if self.config.cache_enabled:
1009
+ self.query_cache[cache_key] = {
1010
+ 'result': result,
1011
+ 'timestamp': time.time()
1012
+ }
1013
+
1014
+ return result
1015
  except Exception as e:
1016
  return {
1017
  'success': False,
 
1020
  }
1021
 
1022
  def analyze_code(self, file_path: str = "", code_snippet: str = "", language: str = "") -> Dict:
1023
+ """Analyze a code file or snippet with improved error handling"""
1024
  if not file_path and not code_snippet:
1025
  return {
1026
  'success': False,
 
1131
  'success': True,
1132
  'message': "Repository visualizations retrieved",
1133
  'visualizations': self.visualizations
1134
+ }
1135
+
1136
+ def clear_caches(self) -> None:
1137
+ """Clear all caches"""
1138
+ self.file_cache.clear()
1139
+ self.contributor_cache.clear()
1140
+ self.commit_cache.clear()
1141
+ self.issue_cache.clear()
1142
+ self.query_cache.clear()
1143
+
1144
+ # Clear LRU caches
1145
+ self.answer_query.cache_clear()
1146
+ if hasattr(self.knowledge_base, 'search_similar_files'):
1147
+ self.knowledge_base.search_similar_files.cache_clear()
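
For reference, a minimal usage sketch of the GitHubAIAgent API touched by this commit (illustrative, not part of the diff): it assumes GEMINI_API_KEY and GITHUB_ACCESS_TOKEN are exported, that the companion modules gemini_integration and visualization_module are importable, and that the repository URL below is only an example.

import os

from github_ai_agent import GitHubAIAgent

agent = GitHubAIAgent()

# set_api_keys() now builds the GitHubManager, KnowledgeBase, GeminiClient and
# RepositoryVisualizer instances, so it must be called before load_repository().
agent.set_api_keys(
    gemini_api_key=os.environ["GEMINI_API_KEY"],
    github_token=os.environ.get("GITHUB_ACCESS_TOKEN"),
)

# Loads files, contributors, commits and issues in parallel, then builds the
# vector index, knowledge graph, Gemini analysis and visualizations.
result = agent.load_repository("https://github.com/octocat/Hello-World")  # example URL
print(result["message"], result.get("file_count"), result.get("contributor_count"))

if result["success"]:
    # answer_query() responses are cached for config.cache_ttl seconds.
    reply = agent.answer_query("Which files implement the main entry point?")
    print(reply["answer"], reply["relevant_files"])

    # Drop the query/file caches and the lru_cache wrappers added in this commit.
    agent.clear_caches()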