09-文件操作-读写Excel、Word、PDF
别再手动复制粘贴了!
你有没有过这样的经历:
- 需要从100个Excel表格里提取数据,复制粘贴到天荒地老
- 需要给1000个Word文档替换文字,打开一个改一个
- 需要把PDF表格里的数据提取出来,手动抄到Excel
别再假装你会手动完成这些事了,用Python吧!
文件操作是Python最实用的技能之一,让你几秒钟搞定几小时的工作。
文件的基本操作
读取文本文件
# 方法1:open() + close()
file = open("test.txt", "r", encoding="utf-8") # r表示读取
content = file.read()
print(content)
file.close()
# 方法2:with语句(推荐,自动关闭文件)
with open("test.txt", "r", encoding="utf-8") as file:
content = file.read()
print(content)
# 逐行读取
with open("test.txt", "r", encoding="utf-8") as file:
for line in file:
line = line.strip() # 去除换行符
print(line)
写入文本文件
# 写入文件(覆盖)
with open("output.txt", "w", encoding="utf-8") as file:
file.write("Hello, World!")
file.write("Hello, Python!")
# 写入文件(追加)
with open("output.txt", "a", encoding="utf-8") as file:
file.write("\n新的一行")
# 写入多行
lines = ["第一行", "第二行", "第三行"]
with open("output.txt", "w", encoding="utf-8") as file:
file.write("\n".join(lines))
文件操作模式
| 模式 | 说明 | 如果文件不存在 |
|---|---|---|
"r" |
只读 | 报错 |
"w" |
只写(覆盖) | 创建新文件 |
"a" |
追加 | 创建新文件 |
"r+" |
读写 | 报错 |
"w+" |
读写(覆盖) | 创建新文件 |
"a+" |
读写(追加) | 创建新文件 |
操作Excel
安装库
pip install openpyxl
读取Excel
from openpyxl import load_workbook
# 打开Excel文件
workbook = load_workbook("data.xlsx")
# 获取所有工作表名
print(workbook.sheetnames) # 输出:['Sheet1', 'Sheet2']
# 选择工作表
sheet = workbook["Sheet1"]
# 读取单个单元格
print(sheet["A1"].value) # 输出:A1的值
print(sheet.cell(row=1, column=1).value) # 同上
# 读取多个单元格(行)
for row in sheet.iter_rows(min_row=1, max_row=3, values_only=True):
print(row)
# 读取所有数据
for row in sheet.iter_rows(values_only=True):
print(row)
# 读取指定范围
for row in sheet["A1:C3"]:
for cell in row:
print(cell.value, end="\t")
print()
写入Excel
from openpyxl import Workbook
# 创建新的Excel文件
workbook = Workbook()
sheet = workbook.active
# 写入单个单元格
sheet["A1"] = "姓名"
sheet["B1"] = "年龄"
sheet["C1"] = "职业"
# 写入多个单元格
data = [
["小明", 25, "程序员"],
["小红", 23, "设计师"],
["小刚", 28, "产品经理"]
]
for row_data in data:
sheet.append(row_data)
# 保存文件
workbook.save("output.xlsx")
# 向现有Excel文件添加数据
from openpyxl import load_workbook
workbook = load_workbook("output.xlsx")
sheet = workbook.active
sheet.append(["小李", 30, "运营"])
workbook.save("output.xlsx")
实战:合并多个Excel文件
from openpyxl import load_workbook, Workbook
def merge_excel_files(file_paths, output_path):
"""合并多个Excel文件"""
# 创建新的Excel文件
merged_workbook = Workbook()
merged_sheet = merged_workbook.active
header_written = False
for file_path in file_paths:
# 读取每个Excel文件
workbook = load_workbook(file_path)
sheet = workbook.active
# 读取数据
for row in sheet.iter_rows(values_only=True):
if not header_written:
merged_sheet.append(row)
header_written = True
else:
# 跳过第一个文件后面的文件的表头
if row[0] != sheet[1].value:
merged_sheet.append(row)
workbook.close()
# 保存合并后的文件
merged_workbook.save(output_path)
print(f"合并完成,保存到 {output_path}")
# 使用示例
file_paths = ["data1.xlsx", "data2.xlsx", "data3.xlsx"]
merge_excel_files(file_paths, "merged.xlsx")
操作Word
安装库
pip install python-docx
读取Word
from docx import Document
# 打开Word文档
doc = Document("document.docx")
# 读取所有段落
for paragraph in doc.paragraphs:
print(paragraph.text)
# 读取所有表格
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
print(cell.text, end="\t")
print()
写入Word
from docx import Document
# 创建新的Word文档
doc = Document()
# 添加标题
doc.add_heading('这是一级标题', level=1)
doc.add_heading('这是二级标题', level=2)
# 添加段落
doc.add_paragraph('这是一个普通的段落。')
doc.add_paragraph('这是另一个段落,包含一些文字。')
# 添加格式化文本
paragraph = doc.add_paragraph()
run = paragraph.add_run('这是粗体文字')
run.bold = True
run = paragraph.add_run(',这是斜体文字')
run.italic = True
# 添加列表
doc.add_paragraph('列表项1', style='List Bullet')
doc.add_paragraph('列表项2', style='List Bullet')
doc.add_paragraph('列表项3', style='List Bullet')
# 添加表格
table = doc.add_table(rows=3, cols=3)
table.style = 'Table Grid'
# 填充表格
for i in range(3):
for j in range(3):
table.cell(i, j).text = f"单元格{i+1}-{j+1}"
# 保存文档
doc.save("output.docx")
实战:批量替换Word文档中的文字
from docx import Document
def replace_in_word(input_path, output_path, replacements):
"""批量替换Word文档中的文字"""
doc = Document(input_path)
# 替换段落中的文字
for paragraph in doc.paragraphs:
for old_text, new_text in replacements.items():
if old_text in paragraph.text:
paragraph.text = paragraph.text.replace(old_text, new_text)
# 替换表格中的文字
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
for old_text, new_text in replacements.items():
if old_text in paragraph.text:
paragraph.text = paragraph.text.replace(old_text, new_text)
# 保存文档
doc.save(output_path)
print(f"替换完成,保存到 {output_path}")
# 使用示例
replacements = {
"旧公司名": "新公司名",
"旧电话": "新电话",
"旧地址": "新地址"
}
replace_in_word("contract.docx", "contract_new.docx", replacements)
操作PDF
安装库
pip install PyPDF2
读取PDF
from PyPDF2 import PdfReader
# 打开PDF文件
reader = PdfReader("document.pdf")
# 获取页数
print(f"总页数:{len(reader.pages)}")
# 读取第一页
page = reader.pages[0]
print(page.extract_text())
# 读取所有页面
for i, page in enumerate(reader.pages, 1):
print(f"=== 第{i}页 ===")
print(page.extract_text())
合并PDF文件
from PyPDF2 import PdfMerger
def merge_pdfs(file_paths, output_path):
"""合并多个PDF文件"""
merger = PdfMerger()
for file_path in file_paths:
merger.append(file_path)
merger.write(output_path)
merger.close()
print(f"合并完成,保存到 {output_path}")
# 使用示例
file_paths = ["file1.pdf", "file2.pdf", "file3.pdf"]
merge_pdfs(file_paths, "merged.pdf")
拆分PDF文件
from PyPDF2 import PdfReader, PdfWriter
def split_pdf(input_path, start_page, end_page, output_path):
"""拆分PDF文件"""
reader = PdfReader(input_path)
writer = PdfWriter()
for page_num in range(start_page - 1, end_page): # 页码从0开始
writer.add_page(reader.pages[page_num])
with open(output_path, "wb") as output_file:
writer.write(output_file)
print(f"拆分完成,保存到 {output_path}")
# 使用示例:提取第1-3页
split_pdf("document.pdf", 1, 3, "pages_1_3.pdf")
实战小项目1:自动化处理Excel数据
from openpyxl import load_workbook, Workbook
def process_sales_data(input_path, output_path):
"""处理销售数据,添加统计信息"""
# 读取数据
workbook = load_workbook(input_path)
sheet = workbook.active
# 获取所有数据
data = []
for row in sheet.iter_rows(values_only=True):
data.append(row)
# 假设数据格式:[商品名, 单价, 数量]
# 添加计算列:金额 = 单价 * 数量
# 跳过表头
for i, row in enumerate(data[1:], start=2): # 从第2行开始
product_name = row[0]
price = row[1]
quantity = row[2]
amount = price * quantity
# 写入金额
sheet.cell(row=i, column=4, value=amount)
# 添加汇总信息
total_amount = sum(row[3] for row in data[1:])
last_row = len(data) + 1
sheet.cell(row=last_row, column=3, value="总金额:")
sheet.cell(row=last_row, column=4, value=total_amount)
# 保存文件
workbook.save(output_path)
print(f"处理完成,保存到 {output_path}")
# 使用示例
process_sales_data("sales_data.xlsx", "sales_data_processed.xlsx")
实战小项目2:批量生成Word合同
from docx import Document
def generate_contract(template_path, output_path, data):
"""根据模板生成合同"""
doc = Document(template_path)
# 替换占位符
for paragraph in doc.paragraphs:
for key, value in data.items():
placeholder = f"{{{key}}}"
if placeholder in paragraph.text:
paragraph.text = paragraph.text.replace(placeholder, str(value))
# 替换表格中的占位符
for table in doc.tables:
for row in table.rows:
for cell in row.cells:
for paragraph in cell.paragraphs:
for key, value in data.items():
placeholder = f"{{{key}}}"
if placeholder in paragraph.text:
paragraph.text = paragraph.text.replace(placeholder, str(value))
# 保存文档
doc.save(output_path)
print(f"合同生成完毕,保存到 {output_path}")
# 使用示例
contract_data = {
"甲方": "张三",
"乙方": "李四",
"合同金额": "100000",
"签订日期": "2026年2月3日",
"有效期": "1年"
}
generate_contract("contract_template.docx", "contract_signed.docx", contract_data)
常见坑及解决方案
坑1:忘记关闭文件
# 错误做法
file = open("test.txt", "r")
content = file.read()
# 忘记关闭文件
# 正确做法:用with语句
with open("test.txt", "r") as file:
content = file.read()
# 自动关闭文件
坑2:编码问题
# 错误做法:可能会报错
with open("test.txt", "r") as file:
content = file.read()
# 正确做法:指定编码
with open("test.txt", "r", encoding="utf-8") as file:
content = file.read()
坑3:Excel索引从1开始
from openpyxl import load_workbook
sheet = workbook.active
# 错误做法:索引从0开始
print(sheet[0][0]) # 报错!
# 正确做法:索引从1开始
print(sheet["A1"]) # 或 sheet.cell(row=1, column=1)
本章小结
- 文本文件:open()、with语句、read()、write()、逐行读取
- Excel操作:openpyxl库、读取、写入、合并、处理数据
- Word操作:python-docx库、读取、写入、批量替换
- PDF操作:PyPDF2库、读取、合并、拆分
- 实战项目:处理销售数据、批量生成合同
文件操作让你自动化处理各种文档,几秒钟搞定几小时的工作。下一章我们来实战,把学到的知识结合起来做完整的项目!
继续学下去,马上就能做实用项目了!