"""Per-contributor and team-level statistics for a Git repository.

Walks the commits reachable from a branch with GitPython, aggregates a set of
metrics per author name, and offers helpers to print a per-author report and a
team-wide summary.
"""

import os
import re
import statistics
from collections import Counter, defaultdict
from datetime import datetime, timedelta

from git import Repo

# Commit-message keyword patterns used to classify commits. The checks are
# independent, so one commit can be counted in several categories.
_REFACTOR_PATTERN = re.compile(r'refactor|重构', re.IGNORECASE)
_BUGFIX_PATTERN = re.compile(r'fix|修复|bug|问题|issue|错误', re.IGNORECASE)
_FEATURE_PATTERN = re.compile(r'feature|功能|新增|add|实现', re.IGNORECASE)
_DOCS_PATTERN = re.compile(r'doc|文档|注释|comment', re.IGNORECASE)

# Path patterns that mark a file as "key" (customise per project).
_KEY_FILE_PATTERNS = [
    re.compile(r'package\.json$'),
    re.compile(r'docker-compose\.yml$'),
    re.compile(r'Dockerfile$'),
    re.compile(r'tsconfig\..*\.json$'),
    re.compile(r'/src/index\.[jt]s$'),
    re.compile(r'README\.md$'),
    re.compile(r'\.env'),
    re.compile(r'/main\.[jt]s$'),
    re.compile(r'/app\.[jt]s$'),
]

_ONE_DAY = timedelta(days=1)


def _new_author_stats():
    """Return a fresh, zeroed statistics record for one author."""
    return {
        'additions': 0,                        # lines added
        'deletions': 0,                        # lines deleted
        'commits': 0,                          # number of commits
        'files_modified': set(),               # every file touched
        'file_types': defaultdict(int),        # touches per file extension
        'commit_dates': set(),                 # distinct commit days
        'commit_hours': defaultdict(int),      # commits per hour of day
        'commit_weekdays': defaultdict(int),   # commits per weekday (0 = Monday)
        'largest_commit': 0,                   # biggest single-commit change size
        'first_commit': None,                  # earliest commit datetime
        'last_commit': None,                   # latest commit datetime
        'commit_sizes': [],                    # change size of each commit
        'commit_messages': [],                 # stripped commit messages
        'commit_message_lengths': [],          # length of each stripped message
        'directories_modified': set(),         # directories of touched files
        'co_authors': set(),                   # placeholder; not populated here
        'impact_score': 0,                     # sum of per-commit impact values
        'complexity_score': 0,                 # derived in post-processing
        'commit_by_month': defaultdict(int),   # key "YYYY-MM"
        'commit_by_quarter': defaultdict(int), # key "YYYY-Qn"
        'commit_by_year': defaultdict(int),    # key: int year
        'commit_by_week': defaultdict(int),    # key "YYYY-Www" (ISO year/week)
        'file_operations': {                   # files grouped by operation
            'created': set(),
            'deleted': set(),
            'modified': set(),
        },
        'review_comments': 0,                  # placeholder; not populated here
        'merge_commits': 0,                    # commits with more than one parent
        'commit_streak': 0,                    # longest run of consecutive days
        'current_streak': 0,                   # run ending today or yesterday
        'contribution_days': [],               # one entry per commit (heat map)
        'code_churn': 0,                       # min/max ratio of adds vs deletes, %
        'file_ownership': {},                  # file path -> ownership percent
        'key_files_modified': set(),           # touched files matching key patterns
        'refactoring_commits': 0,              # message matched refactor keywords
        'bug_fix_commits': 0,                  # message matched bug-fix keywords
        'feature_commits': 0,                  # message matched feature keywords
        'documentation_commits': 0,            # message matched docs keywords
        'commit_size_distribution': defaultdict(int),  # commits per size bucket
        'collaboration_score': 0,              # % of touched files shared
        'consistency_score': 0,                # derived in post-processing
        'expertise_areas': defaultdict(float), # primary directory -> weight share
    }


def _commit_size_category(total_changes):
    """Return the size-bucket label for a commit that changed *total_changes* lines."""
    if total_changes <= 10:
        return "小型(1-10行)"
    if total_changes <= 100:
        return "中型(11-100行)"
    if total_changes <= 500:
        return "大型(101-500行)"
    return "超大型(500+行)"


def _longest_streak(sorted_days):
    """Return the longest run of consecutive dates in a non-empty sorted list."""
    longest = current = 1
    for previous, day in zip(sorted_days, sorted_days[1:]):
        current = current + 1 if day - previous == _ONE_DAY else 1
        longest = max(longest, current)
    return longest


def _trailing_streak(sorted_days):
    """Return the length of the consecutive-date run that ends at the last date."""
    streak = 1
    for i in range(len(sorted_days) - 1, 0, -1):
        if sorted_days[i] - sorted_days[i - 1] != _ONE_DAY:
            break
        streak += 1
    return streak


def _classify_file_operations(commit):
    """Split the files of *commit* into (modified, created, deleted) path sets.

    The commit is diffed against its first parent. A commit without parents
    has all of its files reported as created.
    """
    modified_files = set()
    created_files = set()
    deleted_files = set()
    try:
        if commit.parents:
            for diff_item in commit.parents[0].diff(commit):
                if diff_item.new_file:
                    if diff_item.b_path:
                        created_files.add(diff_item.b_path)
                elif diff_item.deleted_file:
                    if diff_item.a_path:
                        deleted_files.add(diff_item.a_path)
                elif diff_item.a_path:
                    modified_files.add(diff_item.a_path)
        else:
            created_files.update(commit.stats.files)
    except Exception:
        # Deliberate best-effort: if the diff cannot be computed, fall back
        # to treating every file in the commit stats as modified.
        modified_files = set(commit.stats.files.keys())
    return modified_files, created_files, deleted_files


def _finalize_author_stats(data, file_authors):
    """Compute the derived metrics of one author record in place."""
    data['files_count'] = len(data['files_modified'])
    data['active_days'] = len(data['commit_dates'])
    data['key_files_count'] = len(data['key_files_modified'])

    sizes = data['commit_sizes']
    if data['commits'] > 0:
        data['avg_commit_size'] = sum(sizes) / data['commits']
        data['median_commit_size'] = statistics.median(sizes) if sizes else 0

        # The sample standard deviation needs at least two data points.
        variability = statistics.stdev(sizes) if len(sizes) > 1 else 0
        data['complexity_score'] = (
            data['avg_commit_size'] * data['files_count'] * (1 + variability / 1000)
        ) / 100

        # variability > 0 means the sizes differ, so the average is non-zero.
        if variability > 0:
            data['consistency_score'] = 100 * (
                1 - min(1, variability / data['avg_commit_size'])
            )
        else:
            data['consistency_score'] = 100
    else:
        data['avg_commit_size'] = 0
        data['median_commit_size'] = 0
        data['complexity_score'] = 0
        data['consistency_score'] = 0

    data['total_changes'] = data['additions'] + data['deletions']

    # Code churn estimate: ratio of the smaller to the larger of adds/deletes.
    if data['additions'] > 0 and data['deletions'] > 0:
        data['code_churn'] = (
            min(data['additions'], data['deletions'])
            / max(data['additions'], data['deletions'])
            * 100
        )

    if data['first_commit'] and data['last_commit']:
        delta = data['last_commit'] - data['first_commit']
        data['active_period_days'] = delta.days + 1
        # Activity density: commits per day of the active span.
        if delta.days > 0:
            data['activity_density'] = data['commits'] / delta.days
        else:
            data['activity_density'] = data['commits']
    else:
        data['active_period_days'] = 0
        data['activity_density'] = 0

    # Collaboration score: share of touched files that other authors touched.
    # .get() avoids inserting empty entries into the defaultdict while reading.
    total_files = len(data['files_modified'])
    shared_files = sum(
        1 for path in data['files_modified'] if len(file_authors.get(path, ())) > 1
    )
    if total_files > 0:
        data['collaboration_score'] = (shared_files / total_files) * 100

    if data['file_types']:
        primary_type = max(data['file_types'].items(), key=lambda x: x[1])[0]
        data['primary_file_type'] = primary_type
        data['primary_file_type_percent'] = (
            data['file_types'][primary_type] / sum(data['file_types'].values())
        ) * 100

    if data['directories_modified']:
        dir_counts = Counter()
        for directory in data['directories_modified']:
            dir_counts[directory] += 1
            # Ancestor directories get a lower weight than the directory itself.
            parent = os.path.dirname(directory)
            while parent:
                dir_counts[parent] += 0.5
                parent = os.path.dirname(parent)

        if dir_counts:
            primary_dir = max(dir_counts.items(), key=lambda x: x[1])[0]
            data['primary_directory'] = primary_dir
            data['expertise_areas'][primary_dir] = (
                dir_counts[primary_dir] / sum(dir_counts.values())
            )


def get_contributor_stats(repo_path, start_date=None, end_date=None, branch='HEAD'):
    """Collect detailed statistics for every contributor of a repository.

    Commits are grouped by ``commit.author.name``. Commit times come from
    ``datetime.fromtimestamp(commit.committed_date)``, i.e. naive local time,
    and are compared directly with *start_date* / *end_date*, so those are
    expected to be naive datetimes as well.

    Args:
        repo_path: Path of the Git repository.
        start_date: Optional lower bound; earlier commits are skipped.
        end_date: Optional upper bound; later commits are skipped.
        branch: Revision to walk (defaults to ``HEAD``).

    Returns:
        A ``defaultdict`` mapping author name to a statistics dict (see
        ``_new_author_stats`` for the base keys, plus the derived keys added
        by ``_finalize_author_stats``).
    """
    repo = Repo(repo_path)
    stats = defaultdict(_new_author_stats)

    # Authors that touched each file; used for ownership and collaboration.
    file_authors = defaultdict(set)
    # Distinct commit days per author; used for the streak calculations.
    author_commit_days = defaultdict(set)

    for commit in repo.iter_commits(branch):
        commit_date = datetime.fromtimestamp(commit.committed_date)
        if start_date and commit_date < start_date:
            continue
        if end_date and commit_date > end_date:
            continue

        author = commit.author.name
        data = stats[author]
        data['commits'] += 1

        # Date and time of the commit.
        commit_day = commit_date.date()
        data['commit_dates'].add(commit_day)
        data['contribution_days'].append(commit_day)
        data['commit_hours'][commit_date.hour] += 1
        data['commit_weekdays'][commit_date.weekday()] += 1
        author_commit_days[author].add(commit_day)

        # Per-period counters.
        year = commit_date.year
        month = commit_date.month
        quarter = (month - 1) // 3 + 1
        # An ISO week number belongs to the ISO year, which differs from the
        # calendar year around New Year (2024-12-30 is week 1 of ISO 2025).
        iso_year, iso_week, _ = commit_date.isocalendar()
        data['commit_by_year'][year] += 1
        data['commit_by_month'][f"{year}-{month:02d}"] += 1
        data['commit_by_quarter'][f"{year}-Q{quarter}"] += 1
        data['commit_by_week'][f"{iso_year}-W{iso_week:02d}"] += 1

        # Classify the commit by keywords in its message.
        commit_message = commit.message.strip()
        if _REFACTOR_PATTERN.search(commit_message):
            data['refactoring_commits'] += 1
        if _BUGFIX_PATTERN.search(commit_message):
            data['bug_fix_commits'] += 1
        if _FEATURE_PATTERN.search(commit_message):
            data['feature_commits'] += 1
        if _DOCS_PATTERN.search(commit_message):
            data['documentation_commits'] += 1

        # Track the first and the most recent commit.
        if data['first_commit'] is None or commit_date < data['first_commit']:
            data['first_commit'] = commit_date
        if data['last_commit'] is None or commit_date > data['last_commit']:
            data['last_commit'] = commit_date

        data['commit_messages'].append(commit_message)
        data['commit_message_lengths'].append(len(commit_message))

        if len(commit.parents) > 1:
            data['merge_commits'] += 1

        modified_files, created_files, deleted_files = _classify_file_operations(commit)

        total_changes = 0
        directories = set()
        for file_path, item in commit.stats.files.items():
            _, ext = os.path.splitext(file_path)
            data['file_types'][ext or 'no_extension'] += 1

            directory = os.path.dirname(file_path)
            if directory:
                directories.add(directory)

            # Every file in the commit stats counts as touched by this author.
            modified_files.add(file_path)
            file_authors[file_path].add(author)

            data['additions'] += item['insertions']
            data['deletions'] += item['deletions']
            total_changes += item['insertions'] + item['deletions']

        data['files_modified'].update(modified_files)
        data['directories_modified'].update(directories)
        data['file_operations']['created'].update(created_files)
        data['file_operations']['deleted'].update(deleted_files)
        data['file_operations']['modified'].update(
            modified_files - created_files - deleted_files
        )

        data['commit_sizes'].append(total_changes)
        data['commit_size_distribution'][_commit_size_category(total_changes)] += 1
        if total_changes > data['largest_commit']:
            data['largest_commit'] = total_changes

        for file_path in modified_files:
            if any(pattern.search(file_path) for pattern in _KEY_FILE_PATTERNS):
                data['key_files_modified'].add(file_path)

        # Impact: changed lines scaled by the number of touched files.
        impact = total_changes * len(modified_files) / 100 if modified_files else 0
        data['impact_score'] += impact

    # File ownership: split evenly between all authors that touched the file
    # (a single author therefore owns 100%).
    for file_path, authors in file_authors.items():
        share = 100.0 / len(authors)
        for author in authors:
            stats[author]['file_ownership'][file_path] = share

    # Commit streaks per author.
    today = datetime.now().date()
    for author, commit_days in author_commit_days.items():
        if not commit_days:
            continue
        sorted_days = sorted(commit_days)
        stats[author]['commit_streak'] = _longest_streak(sorted_days)
        # The current streak only counts if the last commit was today or yesterday.
        if (today - sorted_days[-1]).days <= 1:
            stats[author]['current_streak'] = _trailing_streak(sorted_days)

    # Post-processing: derived metrics.
    for data in stats.values():
        _finalize_author_stats(data, file_authors)

    return stats


def print_stats(stats):
    """Print the detailed per-contributor report.

    Args:
        stats: Mapping as returned by ``get_contributor_stats``.
    """
    row_format = "{:<20} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15}"
    # Authors ordered by total changed lines, largest first.
    ranked = sorted(stats.items(), key=lambda x: x[1]['total_changes'], reverse=True)

    # Overview table.
    print("\n===== 贡献者基本统计 =====")
    print(row_format.format(
        "作者", "提交数", "添加行数", "删除行数", "总修改行数", "修改文件数", "活跃天数"))
    print("-" * 90)
    for author, data in ranked:
        print(row_format.format(
            author,
            data['commits'],
            data['additions'],
            data['deletions'],
            data['total_changes'],
            data['files_count'],
            data['active_days'],
        ))

    # Detailed section per contributor.
    weekday_names = ["周一", "周二", "周三", "周四", "周五", "周六", "周日"]
    for author, data in ranked:
        print(f"\n\n===== {author} 的详细贡献统计 =====")

        # Active period.
        if data['first_commit'] and data['last_commit']:
            print(f"首次提交时间: {data['first_commit'].strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"最近提交时间: {data['last_commit'].strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"活跃时长: {data['active_period_days']} 天")

        # Commit sizes.
        print(f"平均每次提交修改: {data['avg_commit_size']:.2f} 行")
        print(f"最大单次提交修改: {data['largest_commit']} 行")

        # File type distribution, most frequent first.
        if data['file_types']:
            print("\n文件类型分布:")
            for ext, count in sorted(data['file_types'].items(),
                                     key=lambda x: x[1], reverse=True):
                print(f" {ext}: {count} 次修改")

        # Hour-of-day distribution (only hours with commits).
        if data['commit_hours']:
            print("\n提交时间分布:")
            for hour in range(24):
                count = data['commit_hours'].get(hour, 0)
                if count > 0:
                    print(f" {hour:02d}:00-{hour+1:02d}:00: {count} 次提交")

        # Weekday distribution (only days with commits).
        if data['commit_weekdays']:
            print("\n工作日分布:")
            for day in range(7):
                count = data['commit_weekdays'].get(day, 0)
                if count > 0:
                    print(f" {weekday_names[day]}: {count} 次提交")


def get_team_summary(stats):
    """Aggregate the per-author statistics into a team-wide summary.

    Args:
        stats: Mapping of author name to a statistics dict providing
            'commits', 'additions', 'deletions', 'files_modified',
            'first_commit' and 'last_commit'.

    Returns:
        A dict with the total counters, the set of all touched files, the
        number of contributors and the earliest / latest commit datetimes.
    """
    summary = {
        'total_commits': 0,
        'total_additions': 0,
        'total_deletions': 0,
        'total_files': set(),
        'contributors': len(stats),
        'first_commit': None,
        'last_commit': None,
    }

    for data in stats.values():
        summary['total_commits'] += data['commits']
        summary['total_additions'] += data['additions']
        summary['total_deletions'] += data['deletions']
        summary['total_files'].update(data['files_modified'])

        # Widen the overall first/last commit range.
        first, last = data['first_commit'], data['last_commit']
        if first and (summary['first_commit'] is None or first < summary['first_commit']):
            summary['first_commit'] = first
        if last and (summary['last_commit'] is None or last > summary['last_commit']):
            summary['last_commit'] = last

    return summary


def print_team_summary(summary):
    """Print the team-wide summary produced by ``get_team_summary``."""
    print("\n===== 团队整体统计 =====")
    print(f"贡献者数量: {summary['contributors']}")
    print(f"总提交次数: {summary['total_commits']}")
    print(f"总添加行数: {summary['total_additions']}")
    print(f"总删除行数: {summary['total_deletions']}")
    print(f"总修改行数: {summary['total_additions'] + summary['total_deletions']}")
    print(f"修改的文件数: {len(summary['total_files'])}")

    if summary['first_commit'] and summary['last_commit']:
        print(f"项目起始时间: {summary['first_commit'].strftime('%Y-%m-%d')}")
        print(f"最近活动时间: {summary['last_commit'].strftime('%Y-%m-%d')}")
        delta = summary['last_commit'] - summary['first_commit']
        print(f"项目活跃时长: {delta.days + 1} 天")


def main():
    """Analyse the repository in the current directory and print the reports."""
    # Repository path (current directory).
    repo_path = '.'

    # Example date range; adjust to the period of interest.
    start_date = datetime(2025, 1, 1)
    # End of day, so that commits made on the last day are included.
    end_date = datetime(2025, 12, 31, 23, 59, 59)

    print(f"分析Git仓库: {os.path.abspath(repo_path)}")
    print(f"时间范围: {start_date.strftime('%Y-%m-%d')} 至 {end_date.strftime('%Y-%m-%d')}")

    stats = get_contributor_stats(repo_path, start_date, end_date)

    # Team summary first, then the per-contributor report.
    team_summary = get_team_summary(stats)
    print_team_summary(team_summary)
    print_stats(stats)


if __name__ == "__main__":
    main()