别再手动复制粘贴了!

你有没有过这样的经历:

  • 需要从100个Excel表格里提取数据,复制粘贴到天荒地老
  • 需要给1000个Word文档替换文字,打开一个改一个
  • 需要把PDF表格里的数据提取出来,手动抄到Excel

别再假装你会手动完成这些事了,用Python吧!

文件操作是Python最实用的技能之一,让你几秒钟搞定几小时的工作。

文件的基本操作

读取文本文件

# 方法1:open() + close()
file = open("test.txt", "r", encoding="utf-8")  # r表示读取
content = file.read()
print(content)
file.close()

# 方法2:with语句(推荐,自动关闭文件)
with open("test.txt", "r", encoding="utf-8") as file:
    content = file.read()
    print(content)

# 逐行读取
with open("test.txt", "r", encoding="utf-8") as file:
    for line in file:
        line = line.strip()  # 去除换行符
        print(line)

写入文本文件

# 写入文件(覆盖)
with open("output.txt", "w", encoding="utf-8") as file:
    file.write("Hello, World!")
    file.write("Hello, Python!")

# 写入文件(追加)
with open("output.txt", "a", encoding="utf-8") as file:
    file.write("\n新的一行")

# 写入多行
lines = ["第一行", "第二行", "第三行"]
with open("output.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(lines))

文件操作模式

模式 说明 如果文件不存在
"r" 只读 报错
"w" 只写(覆盖) 创建新文件
"a" 追加 创建新文件
"r+" 读写 报错
"w+" 读写(覆盖) 创建新文件
"a+" 读写(追加) 创建新文件

操作Excel

安装库

pip install openpyxl

读取Excel

from openpyxl import load_workbook

# 打开Excel文件
workbook = load_workbook("data.xlsx")

# 获取所有工作表名
print(workbook.sheetnames)  # 输出:['Sheet1', 'Sheet2']

# 选择工作表
sheet = workbook["Sheet1"]

# 读取单个单元格
print(sheet["A1"].value)  # 输出:A1的值
print(sheet.cell(row=1, column=1).value)  # 同上

# 读取多个单元格(行)
for row in sheet.iter_rows(min_row=1, max_row=3, values_only=True):
    print(row)

# 读取所有数据
for row in sheet.iter_rows(values_only=True):
    print(row)

# 读取指定范围
for row in sheet["A1:C3"]:
    for cell in row:
        print(cell.value, end="\t")
    print()

写入Excel

from openpyxl import Workbook

# 创建新的Excel文件
workbook = Workbook()
sheet = workbook.active

# 写入单个单元格
sheet["A1"] = "姓名"
sheet["B1"] = "年龄"
sheet["C1"] = "职业"

# 写入多个单元格
data = [
    ["小明", 25, "程序员"],
    ["小红", 23, "设计师"],
    ["小刚", 28, "产品经理"]
]

for row_data in data:
    sheet.append(row_data)

# 保存文件
workbook.save("output.xlsx")

# 向现有Excel文件添加数据
from openpyxl import load_workbook

workbook = load_workbook("output.xlsx")
sheet = workbook.active

sheet.append(["小李", 30, "运营"])
workbook.save("output.xlsx")

实战:合并多个Excel文件

from openpyxl import load_workbook, Workbook

def merge_excel_files(file_paths, output_path):
    """合并多个Excel文件"""
    # 创建新的Excel文件
    merged_workbook = Workbook()
    merged_sheet = merged_workbook.active
    header_written = False

    for file_path in file_paths:
        # 读取每个Excel文件
        workbook = load_workbook(file_path)
        sheet = workbook.active

        # 读取数据
        for row in sheet.iter_rows(values_only=True):
            if not header_written:
                merged_sheet.append(row)
                header_written = True
            else:
                # 跳过第一个文件后面的文件的表头
                if row[0] != sheet[1].value:
                    merged_sheet.append(row)

        workbook.close()

    # 保存合并后的文件
    merged_workbook.save(output_path)
    print(f"合并完成,保存到 {output_path}")

# 使用示例
file_paths = ["data1.xlsx", "data2.xlsx", "data3.xlsx"]
merge_excel_files(file_paths, "merged.xlsx")

操作Word

安装库

pip install python-docx

读取Word

from docx import Document

# 打开Word文档
doc = Document("document.docx")

# 读取所有段落
for paragraph in doc.paragraphs:
    print(paragraph.text)

# 读取所有表格
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            print(cell.text, end="\t")
        print()

写入Word

from docx import Document

# 创建新的Word文档
doc = Document()

# 添加标题
doc.add_heading('这是一级标题', level=1)
doc.add_heading('这是二级标题', level=2)

# 添加段落
doc.add_paragraph('这是一个普通的段落。')
doc.add_paragraph('这是另一个段落,包含一些文字。')

# 添加格式化文本
paragraph = doc.add_paragraph()
run = paragraph.add_run('这是粗体文字')
run.bold = True
run = paragraph.add_run(',这是斜体文字')
run.italic = True

# 添加列表
doc.add_paragraph('列表项1', style='List Bullet')
doc.add_paragraph('列表项2', style='List Bullet')
doc.add_paragraph('列表项3', style='List Bullet')

# 添加表格
table = doc.add_table(rows=3, cols=3)
table.style = 'Table Grid'

# 填充表格
for i in range(3):
    for j in range(3):
        table.cell(i, j).text = f"单元格{i+1}-{j+1}"

# 保存文档
doc.save("output.docx")

实战:批量替换Word文档中的文字

from docx import Document

def replace_in_word(input_path, output_path, replacements):
    """批量替换Word文档中的文字"""
    doc = Document(input_path)

    # 替换段落中的文字
    for paragraph in doc.paragraphs:
        for old_text, new_text in replacements.items():
            if old_text in paragraph.text:
                paragraph.text = paragraph.text.replace(old_text, new_text)

    # 替换表格中的文字
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for old_text, new_text in replacements.items():
                        if old_text in paragraph.text:
                            paragraph.text = paragraph.text.replace(old_text, new_text)

    # 保存文档
    doc.save(output_path)
    print(f"替换完成,保存到 {output_path}")

# 使用示例
replacements = {
    "旧公司名": "新公司名",
    "旧电话": "新电话",
    "旧地址": "新地址"
}

replace_in_word("contract.docx", "contract_new.docx", replacements)

操作PDF

安装库

pip install PyPDF2

读取PDF

from PyPDF2 import PdfReader

# 打开PDF文件
reader = PdfReader("document.pdf")

# 获取页数
print(f"总页数:{len(reader.pages)}")

# 读取第一页
page = reader.pages[0]
print(page.extract_text())

# 读取所有页面
for i, page in enumerate(reader.pages, 1):
    print(f"=== 第{i}页 ===")
    print(page.extract_text())

合并PDF文件

from PyPDF2 import PdfMerger

def merge_pdfs(file_paths, output_path):
    """合并多个PDF文件"""
    merger = PdfMerger()

    for file_path in file_paths:
        merger.append(file_path)

    merger.write(output_path)
    merger.close()
    print(f"合并完成,保存到 {output_path}")

# 使用示例
file_paths = ["file1.pdf", "file2.pdf", "file3.pdf"]
merge_pdfs(file_paths, "merged.pdf")

拆分PDF文件

from PyPDF2 import PdfReader, PdfWriter

def split_pdf(input_path, start_page, end_page, output_path):
    """拆分PDF文件"""
    reader = PdfReader(input_path)
    writer = PdfWriter()

    for page_num in range(start_page - 1, end_page):  # 页码从0开始
        writer.add_page(reader.pages[page_num])

    with open(output_path, "wb") as output_file:
        writer.write(output_file)

    print(f"拆分完成,保存到 {output_path}")

# 使用示例:提取第1-3页
split_pdf("document.pdf", 1, 3, "pages_1_3.pdf")

实战小项目1:自动化处理Excel数据

from openpyxl import load_workbook, Workbook

def process_sales_data(input_path, output_path):
    """处理销售数据,添加统计信息"""
    # 读取数据
    workbook = load_workbook(input_path)
    sheet = workbook.active

    # 获取所有数据
    data = []
    for row in sheet.iter_rows(values_only=True):
        data.append(row)

    # 假设数据格式:[商品名, 单价, 数量]
    # 添加计算列:金额 = 单价 * 数量
    # 跳过表头
    for i, row in enumerate(data[1:], start=2):  # 从第2行开始
        product_name = row[0]
        price = row[1]
        quantity = row[2]
        amount = price * quantity

        # 写入金额
        sheet.cell(row=i, column=4, value=amount)

    # 添加汇总信息
    total_amount = sum(row[3] for row in data[1:])
    last_row = len(data) + 1
    sheet.cell(row=last_row, column=3, value="总金额:")
    sheet.cell(row=last_row, column=4, value=total_amount)

    # 保存文件
    workbook.save(output_path)
    print(f"处理完成,保存到 {output_path}")

# 使用示例
process_sales_data("sales_data.xlsx", "sales_data_processed.xlsx")

实战小项目2:批量生成Word合同

from docx import Document

def generate_contract(template_path, output_path, data):
    """根据模板生成合同"""
    doc = Document(template_path)

    # 替换占位符
    for paragraph in doc.paragraphs:
        for key, value in data.items():
            placeholder = f"{{{key}}}"
            if placeholder in paragraph.text:
                paragraph.text = paragraph.text.replace(placeholder, str(value))

    # 替换表格中的占位符
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for paragraph in cell.paragraphs:
                    for key, value in data.items():
                        placeholder = f"{{{key}}}"
                        if placeholder in paragraph.text:
                            paragraph.text = paragraph.text.replace(placeholder, str(value))

    # 保存文档
    doc.save(output_path)
    print(f"合同生成完毕,保存到 {output_path}")

# 使用示例
contract_data = {
    "甲方": "张三",
    "乙方": "李四",
    "合同金额": "100000",
    "签订日期": "2026年2月3日",
    "有效期": "1年"
}

generate_contract("contract_template.docx", "contract_signed.docx", contract_data)

常见坑及解决方案

坑1:忘记关闭文件

# 错误做法
file = open("test.txt", "r")
content = file.read()
# 忘记关闭文件

# 正确做法:用with语句
with open("test.txt", "r") as file:
    content = file.read()
# 自动关闭文件

坑2:编码问题

# 错误做法:可能会报错
with open("test.txt", "r") as file:
    content = file.read()

# 正确做法:指定编码
with open("test.txt", "r", encoding="utf-8") as file:
    content = file.read()

坑3:Excel索引从1开始

from openpyxl import load_workbook

sheet = workbook.active

# 错误做法:索引从0开始
print(sheet[0][0])  # 报错!

# 正确做法:索引从1开始
print(sheet["A1"])  # 或 sheet.cell(row=1, column=1)

本章小结

  • 文本文件:open()、with语句、read()、write()、逐行读取
  • Excel操作:openpyxl库、读取、写入、合并、处理数据
  • Word操作:python-docx库、读取、写入、批量替换
  • PDF操作:PyPDF2库、读取、合并、拆分
  • 实战项目:处理销售数据、批量生成合同

文件操作让你自动化处理各种文档,几秒钟搞定几小时的工作。下一章我们来实战,把学到的知识结合起来做完整的项目!

继续学下去,马上就能做实用项目了!