Skip to content

计算优化

优化计算性能可以大幅提高数据处理效率。

使用向量化操作

python
import pandas as pd
import numpy as np

import time

# Sample data: two columns of 100,000 standard-normal floats each.
df = pd.DataFrame({
    'A': np.random.randn(100000),
    'B': np.random.randn(100000),
})

# Anti-pattern, kept on purpose as the slow baseline: add the two columns
# one row at a time in a Python-level loop.
# time.perf_counter() is used instead of time.time() because it is the
# monotonic, highest-resolution clock intended for measuring short durations.
start = time.perf_counter()
result_loop = []
for i in range(len(df)):
    result_loop.append(df.iloc[i]['A'] + df.iloc[i]['B'])
elapsed = time.perf_counter() - start
print(f"循环耗时: {elapsed:.4f}秒")

# Recommended: a single vectorized addition over the whole columns.
start = time.perf_counter()
result_vector = df['A'] + df['B']
elapsed = time.perf_counter() - start
print(f"向量化耗时: {elapsed:.4f}秒")

使用apply优化

python
# Anti-pattern: a per-row function, meant to be passed to apply(axis=1).
def slow_func(row):
    """Return the product of the 'A' and 'B' entries of a single row."""
    a_value = row['A']
    b_value = row['B']
    return a_value * b_value

# Time the row-wise apply (the slow baseline). perf_counter() is the
# high-resolution monotonic clock intended for measuring short durations.
start = time.perf_counter()
result = df.apply(slow_func, axis=1)
elapsed = time.perf_counter() - start
print(f"apply耗时: {elapsed:.4f}秒")

# Recommended: operate on whole columns at once.
def fast_func(df):
    """Return the product of the 'A' and 'B' columns of *df*."""
    left, right = df['A'], df['B']
    return left * right

# Time the vectorized version. perf_counter() is the high-resolution
# monotonic clock; time.time() can be too coarse for a call this short.
start = time.perf_counter()
result = fast_func(df)
elapsed = time.perf_counter() - start
print(f"向量化耗时: {elapsed:.4f}秒")

使用eval和query

python
# Evaluate a compound arithmetic expression with DataFrame.eval.
# Timing uses perf_counter(), the high-resolution monotonic clock.
start = time.perf_counter()
result = df.eval('A + B * 2')
elapsed = time.perf_counter() - start
print(f"eval耗时: {elapsed:.4f}秒")

# Filter rows with DataFrame.query.
start = time.perf_counter()
result = df.query('A > 0 and B > 0')
elapsed = time.perf_counter() - start
print(f"query耗时: {elapsed:.4f}秒")

使用NumPy函数

python
# pandas: mean computed through the Series method.
# Both calls finish in well under a millisecond, so the high-resolution
# monotonic perf_counter() is used rather than time.time().
start = time.perf_counter()
result = df['A'].mean()
elapsed = time.perf_counter() - start
print(f"Pandas mean耗时: {elapsed:.6f}秒")

# NumPy: mean computed on the underlying array.
start = time.perf_counter()
result = np.mean(df['A'].values)
elapsed = time.perf_counter() - start
print(f"NumPy mean耗时: {elapsed:.6f}秒")

使用Categorical优化分组

python
# One million rows: a label column drawn from four strings and a float column.
df = pd.DataFrame({
    '类别': np.random.choice(['A', 'B', 'C', 'D'], 1000000),
    '值': np.random.randn(1000000),
})

# Baseline: group by the raw string labels.
start = time.perf_counter()
result = df.groupby('类别')['值'].mean()
elapsed = time.perf_counter() - start
print(f"字符串分组耗时: {elapsed:.4f}秒")

# Convert the labels to the categorical dtype (outside the timed region),
# then group again.
df['类别'] = df['类别'].astype('category')
start = time.perf_counter()
# `observed` is passed explicitly: relying on the default for a categorical
# key emits a FutureWarning in recent pandas and the default differs between
# versions. Every category occurs in the data, so the result is unaffected.
result = df.groupby('类别', observed=False)['值'].mean()
elapsed = time.perf_counter() - start
print(f"类别分组耗时: {elapsed:.4f}秒")

使用numba加速

python
# Only the import is guarded, so an ImportError raised by anything else is
# not mistaken for "numba is not installed".
try:
    from numba import jit
except ImportError:
    print("请安装numba: pip install numba")
else:
    # Function compiled by numba in nopython mode.
    @jit(nopython=True)
    def fast_calculation(arr):
        """Return the sum of the squares of the elements of *arr*."""
        result = 0.0
        for i in range(len(arr)):
            result += arr[i] ** 2
        return result

    # Compare performance on one million standard-normal floats.
    arr = np.random.randn(1000000)

    # Baseline: generator expression evaluated by the Python interpreter.
    start = time.perf_counter()
    result_python = sum(x**2 for x in arr)
    elapsed = time.perf_counter() - start
    print(f"Python耗时: {elapsed:.4f}秒")

    # Warm-up call: numba compiles the function on its first call for a given
    # argument type, so compile it here to keep JIT time out of the measurement.
    fast_calculation(arr[:1])

    start = time.perf_counter()
    result_numba = fast_calculation(arr)
    elapsed = time.perf_counter() - start
    print(f"Numba耗时: {elapsed:.4f}秒")

使用多进程

python
from multiprocessing import Pool

def process_chunk(chunk):
    """Return the per-'类别' sum of the '值' column for one chunk.

    `observed=False` is passed explicitly. When '类别' is categorical, the
    result then has one entry for every category (0 for categories absent
    from this chunk), and no FutureWarning about the changing default is
    emitted. For a non-categorical key the argument has no effect.
    """
    return chunk.groupby('类别', observed=False)['值'].sum()

# Number of worker processes, and therefore the number of chunks.
N_WORKERS = 4

# Split the frame into chunks. The row positions are split rather than the
# DataFrame itself, because np.array_split on a DataFrame goes through the
# deprecated DataFrame.swapaxes.
chunks = [
    df.iloc[rows]
    for rows in np.array_split(np.arange(len(df)), N_WORKERS)
]

# Single-process baseline.
start = time.perf_counter()
result_single = df.groupby('类别', observed=False)['值'].sum()
elapsed = time.perf_counter() - start
print(f"单进程耗时: {elapsed:.4f}秒")

# Multi-process run. The guard is required when the start method is "spawn"
# (the default on Windows and macOS): child processes re-import this module,
# and without the guard each of them would try to create its own Pool.
if __name__ == "__main__":
    start = time.perf_counter()
    with Pool(N_WORKERS) as pool:
        results = pool.map(process_chunk, chunks)
    # Combine the partial sums: entries with the same label are added up.
    result_multi = pd.concat(results).groupby(level=0, observed=False).sum()
    elapsed = time.perf_counter() - start
    print(f"多进程耗时: {elapsed:.4f}秒")

避免链式赋值

python
# Demo frame so that this example runs on its own: it needs numeric columns
# 'A' and 'B', which the frame built for the grouping example does not have.
df = pd.DataFrame({
    'A': np.random.randn(1000),
    'B': np.random.randn(1000),
})

# Anti-pattern: chained indexing. df[mask] may return a copy, so the
# assignment can be silently lost (pandas reports SettingWithCopyWarning).
# df[df['A'] > 0]['B'] = 1  # not recommended

# Recommended: select rows and column in one .loc call so that the
# assignment is applied to df itself.
df.loc[df['A'] > 0, 'B'] = 1  # recommended

缓存中间结果

python
# Demo frame so that this example runs on its own: it needs numeric columns
# 'A', 'B' and 'C' (no earlier example defines a column 'C').
df = pd.DataFrame({
    'A': np.random.randn(1000),
    'B': np.random.randn(1000),
    'C': np.random.randn(1000),
})

# Anti-pattern: the same row filter is evaluated twice.
result = df[df['A'] > 0]['B'].sum() + df[df['A'] > 0]['C'].sum()

# Recommended: filter once and reuse the filtered frame.
filtered = df[df['A'] > 0]
result = filtered['B'].sum() + filtered['C'].sum()

计算优化是提升数据处理效率的关键,选择合适的优化方法可以显著减少运行时间。