11-实战项目2-批量处理Excel
每天节省2小时重复劳动!
你有没有过这样的经历:
- 需要从10个Excel表格里汇总数据,复制粘贴到天荒地老
- 需要给100个Excel表格添加统一的格式,打开一个改一个
- 需要根据条件筛选数据,手动筛选到崩溃
别再假装你会手动处理Excel了,用Python吧!
今天我们做一个Excel批量处理工具,几秒钟搞定几小时的工作。
项目目标
实现一个Excel批量处理工具:
- 批量合并多个Excel文件
- 批量格式化Excel(设置列宽、字体、颜色等)
- 批量添加公式和计算列
- 批量筛选数据
- 生成数据统计报告
准备工作
安装所需库:
pip install openpyxl pandas
工具1:批量合并Excel文件
import os
from openpyxl import load_workbook, Workbook
def merge_excel_files(file_paths, output_path, skip_rows=0):
"""
批量合并多个Excel文件
Args:
file_paths: Excel文件路径列表
output_path: 输出文件路径
skip_rows: 跳过每个文件的行数(用于跳过表头)
"""
# 创建新的Excel文件
merged_workbook = Workbook()
merged_sheet = merged_workbook.active
merged_sheet.title = "合并数据"
row_index = 1
for i, file_path in enumerate(file_paths):
print(f"正在处理第{i+1}/{len(file_paths)}个文件:{os.path.basename(file_path)}")
try:
# 读取Excel文件
workbook = load_workbook(file_path)
sheet = workbook.active
# 读取数据
for row in sheet.iter_rows(min_row=skip_rows + 1, values_only=True):
if row[0] is not None: # 跳过空行
# 写入数据
for col_index, cell_value in enumerate(row, 1):
merged_sheet.cell(row=row_index, column=col_index, value=cell_value)
row_index += 1
workbook.close()
print(f" - 成功读取 {sheet.max_row} 行数据")
except Exception as e:
print(f" - 读取失败:{e}")
# 保存文件
merged_workbook.save(output_path)
print(f"\n合并完成!共 {row_index - 1} 行数据")
print(f"保存到:{output_path}")
def merge_folder_excel_files(folder_path, output_path, pattern="*.xlsx"):
"""
合并文件夹下的所有Excel文件
Args:
folder_path: 文件夹路径
output_path: 输出文件路径
pattern: 文件匹配模式
"""
import glob
# 获取所有Excel文件
file_paths = glob.glob(os.path.join(folder_path, pattern))
if not file_paths:
print("没有找到Excel文件!")
return
print(f"找到 {len(file_paths)} 个Excel文件")
merge_excel_files(file_paths, output_path)
# 使用示例
if __name__ == "__main__":
# 合并指定文件
file_paths = [
"data/sales_1月.xlsx",
"data/sales_2月.xlsx",
"data/sales_3月.xlsx"
]
merge_excel_files(file_paths, "output/合并销售数据.xlsx", skip_rows=1)
# 合并文件夹下的所有Excel
# merge_folder_excel_files("data", "output/全部数据.xlsx")
工具2:批量格式化Excel
from openpyxl import load_workbook
from openpyxl.styles import Font, PatternFill, Alignment, Border, Side
from openpyxl.utils import get_column_letter
def format_excel(input_path, output_path):
"""
格式化Excel文件
功能:
- 设置标题行样式
- 自动调整列宽
- 添加边框
- 设置对齐方式
- 添加条件格式
"""
# 读取Excel文件
workbook = load_workbook(input_path)
sheet = workbook.active
# 定义样式
header_font = Font(name="微软雅黑", size=12, bold=True, color="FFFFFF")
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
header_alignment = Alignment(horizontal="center", vertical="center")
body_font = Font(name="微软雅黑", size=10)
body_alignment = Alignment(horizontal="center", vertical="center")
thin_border = Border(
left=Side(style='thin', color='000000'),
right=Side(style='thin', color='000000'),
top=Side(style='thin', color='000000'),
bottom=Side(style='thin', color='000000')
)
# 格式化标题行
for cell in sheet[1]:
cell.font = header_font
cell.fill = header_fill
cell.alignment = header_alignment
cell.border = thin_border
# 格式化数据行
for row in sheet.iter_rows(min_row=2):
for cell in row:
cell.font = body_font
cell.alignment = body_alignment
cell.border = thin_border
# 自动调整列宽
for col in range(1, sheet.max_column + 1):
max_length = 0
column = get_column_letter(col)
# 计算该列的最大宽度
for row in range(1, sheet.max_row + 1):
cell = sheet[f"{column}{row}"]
try:
if cell.value:
max_length = max(max_length, len(str(cell.value)))
except:
pass
# 设置列宽(留一些余量)
adjusted_width = min(max_length + 2, 50)
sheet.column_dimensions[column].width = adjusted_width
# 冻结首行
sheet.freeze_panes = "A2"
# 保存文件
workbook.save(output_path)
print(f"格式化完成!保存到:{output_path}")
# 使用示例
if __name__ == "__main__":
format_excel("data/sales_data.xlsx", "output/销售数据_已格式化.xlsx")
工具3:批量添加计算列
from openpyxl import load_workbook
def add_calculated_column(input_path, output_path, formulas):
"""
批量添加计算列
Args:
input_path: 输入文件路径
output_path: 输出文件路径
formulas: 计算列配置列表
[
{
"column": "D", # 列名
"header": "金额", # 表头名称
"formula": "=B*C" # 公式(使用列号)
}
]
"""
workbook = load_workbook(input_path)
sheet = workbook.active
for config in formulas:
column = config["column"]
header = config["header"]
formula = config["formula"]
# 添加表头
sheet[f"{column}1"] = header
# 添加公式
for row in range(2, sheet.max_row + 1):
cell = sheet[f"{column}{row}"]
cell.value = formula
print(f"已添加列【{column}】:{header}")
# 保存文件
workbook.save(output_path)
print(f"计算列添加完成!保存到:{output_path}")
# 使用示例
if __name__ == "__main__":
formulas = [
{
"column": "D",
"header": "金额",
"formula": "=B2*C2"
},
{
"column": "E",
"header": "税率",
"formula": "=D2*0.1"
}
]
# 注意:公式需要根据实际行号调整
# 这里简化处理,实际应用中需要动态生成公式
add_calculated_column(
"data/product_list.xlsx",
"output/product_list_计算.xlsx",
formulas
)
工具4:批量筛选数据
from openpyxl import load_workbook, Workbook
def filter_excel(input_path, output_path, filters):
"""
批量筛选Excel数据
Args:
input_path: 输入文件路径
output_path: 输出文件路径
filters: 筛选条件列表
[
{
"column": "C", # 列名
"operator": ">", # 操作符:>, <, =, >=, <=, !=
"value": 100 # 值
}
]
"""
# 读取数据
workbook = load_workbook(input_path)
sheet = workbook.active
# 读取表头
headers = [cell.value for cell in sheet[1]]
# 筛选数据
filtered_rows = []
for row in sheet.iter_rows(min_row=2, values_only=True):
# 默认包含该行
include = True
# 检查所有筛选条件
for filter_config in filters:
column = filter_config["column"]
operator = filter_config["operator"]
value = filter_config["value"]
# 获取列号
col_index = ord(column.upper()) - ord("A")
# 获取单元格值
cell_value = row[col_index]
# 执行筛选
try:
cell_value = float(cell_value) if cell_value is not None else 0
value = float(value)
if operator == ">" and not (cell_value > value):
include = False
break
elif operator == "<" and not (cell_value < value):
include = False
break
elif operator == "=" and not (cell_value == value):
include = False
break
elif operator == ">=" and not (cell_value >= value):
include = False
break
elif operator == "<=" and not (cell_value <= value):
include = False
break
elif operator == "!=" and not (cell_value != value):
include = False
break
except:
include = False
break
if include:
filtered_rows.append(row)
# 创建新文件
filtered_workbook = Workbook()
filtered_sheet = filtered_workbook.active
# 写入表头
for col, header in enumerate(headers, 1):
filtered_sheet.cell(row=1, column=col, value=header)
# 写入筛选后的数据
for row_index, row_data in enumerate(filtered_rows, 2):
for col, value in enumerate(row_data, 1):
filtered_sheet.cell(row=row_index, column=col, value=value)
# 保存文件
filtered_workbook.save(output_path)
print(f"筛选完成!保留 {len(filtered_rows)} 行数据")
print(f"保存到:{output_path}")
# 使用示例
if __name__ == "__main__":
filters = [
{
"column": "D", # D列(金额)
"operator": ">",
"value": 1000 # 金额大于1000
}
]
filter_excel(
"data/sales_data.xlsx",
"output/销售数据_筛选.xlsx",
filters
)
工具5:生成数据统计报告
from openpyxl import load_workbook, Workbook
from openpyxl.styles import Font, PatternFill, Alignment
import statistics
def generate_statistics(input_path, output_path):
"""生成数据统计报告"""
# 读取数据
workbook = load_workbook(input_path)
sheet = workbook.active
# 读取所有数据
data = list(sheet.iter_rows(values_only=True))
headers = data[0] # 第一行是表头
rows = data[1:] # 其他是数据
# 创建报告
report_workbook = Workbook()
report_sheet = report_workbook.active
report_sheet.title = "统计报告"
# 设置标题样式
title_font = Font(name="微软雅黑", size=16, bold=True)
header_font = Font(name="微软雅黑", size=12, bold=True)
header_fill = PatternFill(start_color="4472C4", end_color="4472C4", fill_type="solid")
header_alignment = Alignment(horizontal="center", vertical="center")
# 写入标题
report_sheet["A1"] = "数据统计报告"
report_sheet["A1"].font = title_font
report_sheet.merge_cells("A1:E1")
# 写入基本信息
report_sheet["A3"] = "统计时间:"
report_sheet["B3"] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
report_sheet["A4"] = "总行数:"
report_sheet["B4"] = len(rows)
# 写入各列统计信息
row_index = 6
report_sheet["A6"] = "列名"
report_sheet["B6"] = "数据类型"
report_sheet["C6"] = "非空数量"
report_sheet["D6"] = "最小值"
report_sheet["E6"] = "最大值"
report_sheet["F6"] = "平均值"
# 设置表头样式
for col in ["A", "B", "C", "D", "E", "F"]:
cell = report_sheet[f"{col}6"]
cell.font = header_font
cell.fill = header_fill
cell.alignment = header_alignment
# 统计每一列
for col_index, header in enumerate(headers, 1):
col_letter = chr(ord("A") + col_index - 1)
# 获取该列所有数据
col_data = [row[col_index - 1] for row in rows]
# 统计非空数量
non_null_count = sum(1 for value in col_data if value is not None)
# 判断数据类型
sample = col_data[0] if col_data else None
if isinstance(sample, (int, float)):
data_type = "数字"
min_value = min(col_data) if col_data else ""
max_value = max(col_data) if col_data else ""
avg_value = statistics.mean(col_data) if col_data else ""
else:
data_type = "文本"
min_value = ""
max_value = ""
avg_value = ""
# 写入统计信息
row_index += 1
report_sheet[f"A{row_index}"] = header
report_sheet[f"B{row_index}"] = data_type
report_sheet[f"C{row_index}"] = non_null_count
report_sheet[f"D{row_index}"] = min_value
report_sheet[f"E{row_index}"] = max_value
report_sheet[f"F{row_index}"] = round(avg_value, 2) if avg_value else ""
# 保存报告
report_workbook.save(output_path)
print(f"统计报告生成完成!保存到:{output_path}")
# 使用示例
if __name__ == "__main__":
import datetime
generate_statistics(
"data/sales_data.xlsx",
"output/统计报告.xlsx"
)
完整工具集
class ExcelProcessor:
"""Excel批量处理工具集"""
def __init__(self):
pass
def merge_files(self, file_paths, output_path, skip_rows=0):
"""合并多个Excel文件"""
merge_excel_files(file_paths, output_path, skip_rows)
def format_file(self, input_path, output_path):
"""格式化Excel文件"""
format_excel(input_path, output_path)
def add_calculations(self, input_path, output_path, formulas):
"""添加计算列"""
add_calculated_column(input_path, output_path, formulas)
def filter_data(self, input_path, output_path, filters):
"""筛选数据"""
filter_excel(input_path, output_path, filters)
def generate_report(self, input_path, output_path):
"""生成统计报告"""
generate_statistics(input_path, output_path)
# 使用示例
if __name__ == "__main__":
processor = ExcelProcessor()
# 一键完成所有操作
input_file = "data/sales_data.xlsx"
# 1. 格式化
processor.format_file(input_file, "output/1_已格式化.xlsx")
# 2. 筛选
filters = [{"column": "D", "operator": ">", "value": 1000}]
processor.filter_data("output/1_已格式化.xlsx", "output/2_已筛选.xlsx", filters)
# 3. 生成报告
processor.generate_report("output/2_已筛选.xlsx", "output/3_统计报告.xlsx")
print("所有操作完成!")
本章小结
- 批量合并Excel:自动合并多个Excel文件
- 批量格式化:设置列宽、字体、颜色、边框
- 添加计算列:自动添加公式和计算字段
- 批量筛选:根据条件筛选数据
- 生成报告:自动生成数据统计报告
这个工具可以帮你每天节省2小时处理Excel的时间!下一章我们来学习自动发送邮件,装X利器!
继续学下去,马上就能做实用项目了!