# staff_data/scripts/git_stats.py

import os
import re
import statistics
from collections import Counter, defaultdict
from datetime import datetime

from git import Repo


def get_contributor_stats(repo_path, start_date=None, end_date=None, branch='HEAD'):
    """
    Collect detailed statistics for every contributor of a Git repository.

    Args:
        repo_path: path to the Git repository
        start_date: start of the date range (optional)
        end_date: end of the date range (optional)
        branch: branch to analyse, defaults to HEAD

    Returns:
        A dict mapping each contributor to their detailed statistics.
    """
    # Initialise the repository object
    repo = Repo(repo_path)
    # Per-author statistics
    stats = defaultdict(lambda: {
        'additions': 0,                 # lines added
        'deletions': 0,                 # lines deleted
        'commits': 0,                   # number of commits
        'files_modified': set(),        # set of files touched
        'file_types': defaultdict(int),         # modifications per file type
        'commit_dates': set(),          # distinct commit dates
        'commit_hours': defaultdict(int),       # commits per hour of day
        'commit_weekdays': defaultdict(int),    # commits per weekday
        'largest_commit': 0,            # largest change in a single commit
        'first_commit': None,           # timestamp of the first commit
        'last_commit': None,            # timestamp of the most recent commit
        'commit_sizes': [],             # size of each commit, for mean and median
        'commit_messages': [],          # list of commit messages
        'commit_message_lengths': [],   # lengths of the commit messages
        'directories_modified': set(),  # set of directories touched
        'co_authors': set(),            # set of co-authors
        'impact_score': 0,              # impact score
        'complexity_score': 0,          # complexity score
        'commit_by_month': defaultdict(int),    # commits per month
        'commit_by_quarter': defaultdict(int),  # commits per quarter
        'commit_by_year': defaultdict(int),     # commits per year
        'commit_by_week': defaultdict(int),     # commits per week
        'file_operations': {            # file operation statistics
            'created': set(),           # files created
            'deleted': set(),           # files deleted
            'modified': set(),          # files modified
        },
        'review_comments': 0,           # review comments (if available)
        'merge_commits': 0,             # number of merge commits
        'commit_streak': 0,             # longest run of consecutive commit days
        'current_streak': 0,            # current run of consecutive commit days
        'contribution_days': [],        # every contribution date (for heatmaps)
        'code_churn': 0,                # code churn (code added and later removed)
        'file_ownership': {},           # ownership percentage per file
        'key_files_modified': set(),    # key files touched
        'refactoring_commits': 0,       # refactoring commits (from message analysis)
        'bug_fix_commits': 0,           # bug-fix commits
        'feature_commits': 0,           # feature commits
        'documentation_commits': 0,     # documentation-related commits
        'commit_size_distribution': defaultdict(int),  # distribution of commit sizes
        'collaboration_score': 0,       # collaboration score
        'consistency_score': 0,         # consistency score
        'expertise_areas': defaultdict(float),  # areas of expertise (directories/languages)
    })
    # Authors who touched each file, used for the collaboration metrics
    file_authors = defaultdict(set)
    # Importance weight of each file (based on modification frequency)
    file_importance = Counter()
    # Regular expressions used to classify commit messages
    # (the Chinese alternatives match commit messages written in Chinese)
    refactor_pattern = re.compile(r'refactor|重构', re.IGNORECASE)
    bugfix_pattern = re.compile(r'fix|修复|bug|问题|issue|错误', re.IGNORECASE)
    feature_pattern = re.compile(r'feature|功能|新增|add|实现', re.IGNORECASE)
    docs_pattern = re.compile(r'doc|文档|注释|comment', re.IGNORECASE)
    # Commit dates per author, used to compute commit streaks
    author_commit_days = defaultdict(set)
    # Path patterns for key files (customise per project)
    key_file_patterns = [
        re.compile(r'package\.json$'),
        re.compile(r'docker-compose\.yml$'),
        re.compile(r'Dockerfile$'),
        re.compile(r'tsconfig\..*\.json$'),
        re.compile(r'/src/index\.[jt]s$'),
        re.compile(r'README\.md$'),
        re.compile(r'\.env'),
        re.compile(r'/main\.[jt]s$'),
        re.compile(r'/app\.[jt]s$'),
    ]
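    # Example (illustrative): with the patterns above, 'frontend/package.json' and
    # '.env.local' count as key files, while 'docs/notes.md' does not.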
    # Walk all commits on the given branch
    for commit in repo.iter_commits(branch):
        # Filter by date range
        commit_date = datetime.fromtimestamp(commit.committed_date)
        if start_date and commit_date < start_date:
            continue
        if end_date and commit_date > end_date:
            continue
        author = commit.author.name
        stats[author]['commits'] += 1
        # Record the commit date and time
        commit_day = commit_date.date()
        stats[author]['commit_dates'].add(commit_day)
        stats[author]['contribution_days'].append(commit_day)  # for heatmaps
        stats[author]['commit_hours'][commit_date.hour] += 1
        stats[author]['commit_weekdays'][commit_date.weekday()] += 1
        # Add the day to the author's set of commit days
        author_commit_days[author].add(commit_day)
        # Bucket the commit by time period
        year = commit_date.year
        month = commit_date.month
        quarter = (month - 1) // 3 + 1
        week_num = commit_date.isocalendar()[1]
        stats[author]['commit_by_year'][year] += 1
        stats[author]['commit_by_month'][f"{year}-{month:02d}"] += 1
        stats[author]['commit_by_quarter'][f"{year}-Q{quarter}"] += 1
        stats[author]['commit_by_week'][f"{year}-W{week_num:02d}"] += 1
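        # Example (illustrative): a commit on 2025-05-14 is counted under year 2025,
        # month "2025-05", quarter "2025-Q2" and ISO week "2025-W20".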
        # Classify the commit based on its message
        commit_message = commit.message.strip()
        if refactor_pattern.search(commit_message):
            stats[author]['refactoring_commits'] += 1
        if bugfix_pattern.search(commit_message):
            stats[author]['bug_fix_commits'] += 1
        if feature_pattern.search(commit_message):
            stats[author]['feature_commits'] += 1
        if docs_pattern.search(commit_message):
            stats[author]['documentation_commits'] += 1
        # Track first and most recent commits
        if stats[author]['first_commit'] is None or commit_date < stats[author]['first_commit']:
            stats[author]['first_commit'] = commit_date
        if stats[author]['last_commit'] is None or commit_date > stats[author]['last_commit']:
            stats[author]['last_commit'] = commit_date
        # Record the commit message
        stats[author]['commit_messages'].append(commit_message)
        stats[author]['commit_message_lengths'].append(len(commit_message))
        # Detect merge commits
        if len(commit.parents) > 1:
            stats[author]['merge_commits'] += 1
        # Count added and deleted lines
        total_changes = 0
        modified_files = set()
        created_files = set()
        deleted_files = set()
        directories = set()
        # Diff against the first parent to determine the type of each file operation
        try:
            if commit.parents:
                parent = commit.parents[0]
                diffs = parent.diff(commit)
                for diff_item in diffs:
                    if diff_item.new_file:
                        if diff_item.b_path:
                            created_files.add(diff_item.b_path)
                    elif diff_item.deleted_file:
                        if diff_item.a_path:
                            deleted_files.add(diff_item.a_path)
                    else:
                        if diff_item.a_path:
                            modified_files.add(diff_item.a_path)
            else:
                # For the initial commit, every file counts as newly created
                for file_path in commit.stats.files:
                    created_files.add(file_path)
        except Exception:
            # If diffing fails, fall back to a plain set of modified files
            modified_files = set(commit.stats.files.keys())
        for file_path, item in commit.stats.files.items():
            # Count file types by extension
            _, ext = os.path.splitext(file_path)
            if ext:  # make sure the extension is not empty
                stats[author]['file_types'][ext] += 1
            else:
                stats[author]['file_types']['no_extension'] += 1
            # Record the directory
            directory = os.path.dirname(file_path)
            if directory:
                directories.add(directory)
            # Record the modified file
            modified_files.add(file_path)
            # Record who touched the file, for the collaboration metrics
            file_authors[file_path].add(author)
            # Accumulate added and deleted lines
            stats[author]['additions'] += item['insertions']
            stats[author]['deletions'] += item['deletions']
            total_changes += item['insertions'] + item['deletions']
        # Update the sets of modified files and directories
        stats[author]['files_modified'].update(modified_files)
        stats[author]['directories_modified'].update(directories)
        stats[author]['file_operations']['created'].update(created_files)
        stats[author]['file_operations']['deleted'].update(deleted_files)
        stats[author]['file_operations']['modified'].update(modified_files - created_files - deleted_files)
        # Record the size of this commit
        stats[author]['commit_sizes'].append(total_changes)
        # Record the commit size distribution
        commit_size_category = "small (1-10 lines)" if total_changes <= 10 else \
            "medium (11-100 lines)" if total_changes <= 100 else \
            "large (101-500 lines)" if total_changes <= 500 else \
            "very large (500+ lines)"
        stats[author]['commit_size_distribution'][commit_size_category] += 1
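        # Example (illustrative): a total_changes of 8 falls in "small (1-10 lines)",
        # 100 in "medium (11-100 lines)", and 101 in "large (101-500 lines)".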
        # Update the largest single-commit change size
        if total_changes > stats[author]['largest_commit']:
            stats[author]['largest_commit'] = total_changes
        # Check whether any modified file is a key file
        for file_path in modified_files:
            for pattern in key_file_patterns:
                if pattern.search(file_path):
                    stats[author]['key_files_modified'].add(file_path)
                    break
        # Update file importance weights
        for file_path in modified_files:
            file_importance[file_path] += 1
        # Impact score (based on total lines changed and number of files touched)
        impact = total_changes * len(modified_files) / 100 if modified_files else 0
        stats[author]['impact_score'] += impact
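        # Worked example (illustrative): 50 changed lines spread over 4 files add
        # 50 * 4 / 100 = 2.0 to the author's impact_score.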
    # File collaboration and file ownership
    for file_path, authors in file_authors.items():
        # A single author owns 100% of a file only they have modified
        if len(authors) == 1:
            author = next(iter(authors))
            if 'file_ownership' not in stats[author]:
                stats[author]['file_ownership'] = {}
            stats[author]['file_ownership'][file_path] = 100.0
        else:
            # Several authors modified the file: split ownership between them
            for author in authors:
                # Simplification: share ownership equally
                ownership_percent = 100.0 / len(authors)
                if 'file_ownership' not in stats[author]:
                    stats[author]['file_ownership'] = {}
                stats[author]['file_ownership'][file_path] = ownership_percent
    # Commit streaks per author
    for author, commit_days in author_commit_days.items():
        if not commit_days:
            continue
        # Sort the commit dates
        sorted_days = sorted(commit_days)
        # Longest run of consecutive commit days
        current_streak = 1
        max_streak = 1
        for i in range(1, len(sorted_days)):
            # Extend the streak when two dates are exactly one day apart
            if (sorted_days[i] - sorted_days[i - 1]).days == 1:
                current_streak += 1
            else:
                # Otherwise reset the running streak
                current_streak = 1
            max_streak = max(max_streak, current_streak)
        # Record the longest streak
        stats[author]['commit_streak'] = max_streak
        # Current streak (counted back from the most recent commit day)
        if sorted_days:
            today = datetime.now().date()
            days_since_last = (today - sorted_days[-1]).days
            if days_since_last <= 1:  # last commit was today or yesterday
                current_streak = 1
                for i in range(len(sorted_days) - 1, 0, -1):
                    if (sorted_days[i] - sorted_days[i - 1]).days == 1:
                        current_streak += 1
                    else:
                        break
                stats[author]['current_streak'] = current_streak
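    # Example (illustrative): commit days on Mon/Tue/Wed/Fri of one week give a
    # commit_streak of 3; current_streak is only filled in when the latest commit
    # was today or yesterday.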
    # Post-processing: derive metrics and convert sets to counts
    for author, data in stats.items():
        # Convert file sets into counts
        data['files_count'] = len(data['files_modified'])
        data['active_days'] = len(data['commit_dates'])
        data['key_files_count'] = len(data['key_files_modified'])
        # Average change size per commit
        if data['commits'] > 0:
            data['avg_commit_size'] = sum(data['commit_sizes']) / data['commits']
            data['median_commit_size'] = statistics.median(data['commit_sizes']) if data['commit_sizes'] else 0
            # Complexity score (based on change size, file count and variability)
            variability = statistics.stdev(data['commit_sizes']) if len(data['commit_sizes']) > 1 else 0
            data['complexity_score'] = (data['avg_commit_size'] * data['files_count'] * (1 + variability / 1000)) / 100
            # Consistency score (how uniform commit sizes and frequency are)
            if variability > 0:
                data['consistency_score'] = 100 * (1 - min(1, variability / data['avg_commit_size']))
            else:
                data['consistency_score'] = 100
        else:
            data['avg_commit_size'] = 0
            data['median_commit_size'] = 0
            data['complexity_score'] = 0
            data['consistency_score'] = 0
        # Total lines changed
        data['total_changes'] = data['additions'] + data['deletions']
        # Code churn estimate (code that was added and later removed)
        if data['additions'] > 0 and data['deletions'] > 0:
            data['code_churn'] = min(data['additions'], data['deletions']) / max(data['additions'], data['deletions']) * 100
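            # Worked example (illustrative): 500 additions and 200 deletions give
            # min/max = 200/500, i.e. an estimated churn of 40.0.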
        # Active period in days
        if data['first_commit'] and data['last_commit']:
            delta = data['last_commit'] - data['first_commit']
            data['active_period_days'] = delta.days + 1
            # Activity density (commits per day of the active period)
            if delta.days > 0:
                data['activity_density'] = data['commits'] / delta.days
            else:
                data['activity_density'] = data['commits']
        else:
            data['active_period_days'] = 0
            data['activity_density'] = 0
        # Collaboration score (share of this author's files that others also modified)
        total_files = len(data['files_modified'])
        shared_files = sum(1 for f in data['files_modified'] if len(file_authors[f]) > 1)
        if total_files > 0:
            data['collaboration_score'] = (shared_files / total_files) * 100
        # Areas of expertise (based on file types and directories)
        if data['file_types']:
            primary_type = max(data['file_types'].items(), key=lambda x: x[1])[0]
            data['primary_file_type'] = primary_type
            data['primary_file_type_percent'] = (data['file_types'][primary_type] / sum(data['file_types'].values())) * 100
        # Directory expertise
        if data['directories_modified']:
            dir_counts = Counter()
            for directory in data['directories_modified']:
                dir_counts[directory] += 1
                # Also credit parent directories
                parent = os.path.dirname(directory)
                while parent:
                    dir_counts[parent] += 0.5  # lower weight for parent directories
                    parent = os.path.dirname(parent)
            # The most frequently modified directory is the primary area of expertise
            if dir_counts:
                primary_dir = max(dir_counts.items(), key=lambda x: x[1])[0]
                data['primary_directory'] = primary_dir
                data['expertise_areas'][primary_dir] = dir_counts[primary_dir] / sum(dir_counts.values())
    return stats
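

# Usage sketch (illustrative, kept as a comment so the module stays side-effect
# free on import): assuming GitPython is installed and the path points at a Git
# repository, the collector can be driven roughly like this; the path and date
# below are placeholders, not values from the original script.
#
#     stats = get_contributor_stats('.', start_date=datetime(2024, 1, 1))
#     for author, data in stats.items():
#         print(author, data['commits'], data['additions'], data['deletions'])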


def print_stats(stats):
    """Print a detailed report for every contributor."""
    # Header of the overview table
    print("\n===== Contributor overview =====")
    print("{:<20} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15}".format(
        "Author", "Commits", "Added", "Deleted", "Changed", "Files", "Active days"))
    print("-" * 90)
    # Sort contributors by total lines changed
    for author, data in sorted(stats.items(), key=lambda x: x[1]['total_changes'], reverse=True):
        print("{:<20} {:<10} {:<10} {:<10} {:<10} {:<15} {:<15}".format(
            author,
            data['commits'],
            data['additions'],
            data['deletions'],
            data['total_changes'],
            data['files_count'],
            data['active_days']
        ))
    # Print a detailed section for every contributor
    for author, data in sorted(stats.items(), key=lambda x: x[1]['total_changes'], reverse=True):
        print(f"\n\n===== Detailed statistics for {author} =====")
        # Activity period
        if data['first_commit'] and data['last_commit']:
            print(f"First commit: {data['first_commit'].strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"Most recent commit: {data['last_commit'].strftime('%Y-%m-%d %H:%M:%S')}")
            print(f"Active period: {data['active_period_days']} days")
        # Commit size
        print(f"Average lines changed per commit: {data['avg_commit_size']:.2f}")
        print(f"Largest single commit: {data['largest_commit']} lines changed")
        # File type distribution
        if data['file_types']:
            print("\nFile type distribution:")
            for ext, count in sorted(data['file_types'].items(), key=lambda x: x[1], reverse=True):
                print(f"  {ext}: {count} modifications")
        # Commit hour distribution
        if data['commit_hours']:
            print("\nCommit hour distribution:")
            for hour in range(24):
                count = data['commit_hours'].get(hour, 0)
                if count > 0:
                    print(f"  {hour:02d}:00-{hour+1:02d}:00: {count} commits")
        # Weekday distribution
        if data['commit_weekdays']:
            weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
            print("\nWeekday distribution:")
            for day in range(7):
                count = data['commit_weekdays'].get(day, 0)
                if count > 0:
                    print(f"  {weekday_names[day]}: {count} commits")


def get_team_summary(stats):
    """Build an overall summary for the whole team."""
    summary = {
        'total_commits': 0,
        'total_additions': 0,
        'total_deletions': 0,
        'total_files': set(),
        'contributors': len(stats),
        'first_commit': None,
        'last_commit': None,
    }
    for author, data in stats.items():
        summary['total_commits'] += data['commits']
        summary['total_additions'] += data['additions']
        summary['total_deletions'] += data['deletions']
        summary['total_files'].update(data['files_modified'])
        # Track the earliest and latest commits across the team
        if data['first_commit']:
            if summary['first_commit'] is None or data['first_commit'] < summary['first_commit']:
                summary['first_commit'] = data['first_commit']
        if data['last_commit']:
            if summary['last_commit'] is None or data['last_commit'] > summary['last_commit']:
                summary['last_commit'] = data['last_commit']
    return summary
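

# Illustrative result shape (values invented for the example, keys as built above):
#
#     {'total_commits': 42, 'total_additions': 1280, 'total_deletions': 310,
#      'total_files': {...}, 'contributors': 3,
#      'first_commit': datetime(...), 'last_commit': datetime(...)}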


def print_team_summary(summary):
    """Print the overall team summary."""
    print("\n===== Team summary =====")
    print(f"Contributors: {summary['contributors']}")
    print(f"Total commits: {summary['total_commits']}")
    print(f"Total lines added: {summary['total_additions']}")
    print(f"Total lines deleted: {summary['total_deletions']}")
    print(f"Total lines changed: {summary['total_additions'] + summary['total_deletions']}")
    print(f"Files touched: {len(summary['total_files'])}")
    if summary['first_commit'] and summary['last_commit']:
        print(f"Project start date: {summary['first_commit'].strftime('%Y-%m-%d')}")
        print(f"Latest activity: {summary['last_commit'].strftime('%Y-%m-%d')}")
        delta = summary['last_commit'] - summary['first_commit']
        print(f"Project active period: {delta.days + 1} days")


if __name__ == "__main__":
    # Repository path (current directory)
    repo_path = '.'
    # Example date range
    # Note: these are 2025 dates; adjust them to the period you actually want to analyse
    start_date = datetime(2025, 1, 1)
    end_date = datetime(2025, 12, 31)
    print(f"Analysing Git repository: {os.path.abspath(repo_path)}")
    print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    # Collect contributor statistics
    stats = get_contributor_stats(repo_path, start_date, end_date)
    # Print the team summary
    team_summary = get_team_summary(stats)
    print_team_summary(team_summary)
    # Print the detailed per-contributor report
    print_stats(stats)