Appearance
计算优化
优化计算性能可以大幅提高数据处理效率。
使用向量化操作
python
import pandas as pd
import numpy as np
# Demo frame: columns 'A' and 'B', each filled with 100000 draws from
# np.random.randn.
df = pd.DataFrame(
    dict(
        A=np.random.randn(100000),
        B=np.random.randn(100000),
    )
)
# Anti-pattern: a Python-level loop that performs two positional row lookups
# (df.iloc[i]) per iteration and appends the sum to a list.
import time
start = time.time()
result_loop = []
for i in range(len(df)):
    result_loop.append(df.iloc[i]['A'] + df.iloc[i]['B'])
print(f"循环耗时: {time.time() - start:.4f}秒")
# Preferred: a single vectorised column addition. Its elapsed time is printed
# by the statement that follows this block.
start = time.time()
result_vector = df['A'] + df['B']
print(f"向量化耗时: {time.time() - start:.4f}秒")
使用apply优化
python
# 不好的做法:逐行apply
def slow_func(row):
    """Return the product of the ``'A'`` and ``'B'`` entries of one row.

    In this example it is passed to ``DataFrame.apply(..., axis=1)``, which
    invokes it once per row.

    Args:
        row: Object indexable by the keys ``'A'`` and ``'B'``.

    Returns:
        The value of ``row['A'] * row['B']``.
    """
    return row['A'] * row['B']
# Time the row-wise apply: axis=1 makes pandas call slow_func once per row.
start = time.time()
result = df.apply(slow_func, axis=1)
print(f"apply耗时: {time.time() - start:.4f}秒")
# 好的做法:使用向量化
def fast_func(df):
    """Return the element-wise product of columns ``'A'`` and ``'B'``.

    The multiplication is expressed as one column-level operation rather than
    a call per row.

    Args:
        df: Object indexable by ``'A'`` and ``'B'``; a ``DataFrame`` in this
            example. Note that the parameter name shadows the module-level
            ``df`` inside the function.

    Returns:
        The value of ``df['A'] * df['B']``.
    """
    return df['A'] * df['B']
# Time the vectorised version on the same frame; the elapsed time is printed
# by the statement that follows.
start = time.time()
result = fast_func(df)
print(f"向量化耗时: {time.time() - start:.4f}秒")
使用eval和query
python
# Use eval to compute a compound expression over the columns
start = time.time()
result = df.eval('A + B * 2')
print(f"eval耗时: {time.time() - start:.4f}秒")
# Use query to filter rows; the elapsed time is printed by the next statement
start = time.time()
result = df.query('A > 0 and B > 0')
print(f"query耗时: {time.time() - start:.4f}秒")
使用NumPy函数
python
# Pandas operation: mean computed through the Series method
start = time.time()
result = df['A'].mean()
print(f"Pandas mean耗时: {time.time() - start:.6f}秒")
# NumPy operation: np.mean applied to the column's .values; the elapsed time
# is printed by the next statement
start = time.time()
result = np.mean(df['A'].values)
print(f"NumPy mean耗时: {time.time() - start:.6f}秒")
使用Categorical优化分组
python
# One million rows: a key column drawn from four string labels and a float
# value column.
df = pd.DataFrame({
    '类别': np.random.choice(['A', 'B', 'C', 'D'], 1000000),
    '值': np.random.randn(1000000)
})
# Baseline: group on the string key.
start = time.time()
result = df.groupby('类别')['值'].mean()
print(f"字符串分组耗时: {time.time() - start:.4f}秒")
# Convert the key to categorical dtype, then repeat the same aggregation.
# observed=True is spelled out because pandas 2.1+ emits a FutureWarning when
# a categorical grouper is used without it (the default is being changed from
# False to True). The elapsed time is printed by the statement that follows.
df['类别'] = df['类别'].astype('category')
start = time.time()
result = df.groupby('类别', observed=True)['值'].mean()
print(f"类别分组耗时: {time.time() - start:.4f}秒")
使用numba加速
python
try:
from numba import jit
# 使用numba加速的函数
@jit(nopython=True)
def fast_calculation(arr):
    """Return the sum of the squares of the elements of ``arr``.

    The function is decorated with numba's ``jit`` in nopython mode, so the
    explicit index loop is what gets compiled.

    Args:
        arr: Sequence supporting ``len()`` and integer indexing; a
            one-dimensional NumPy float array in this example.

    Returns:
        The accumulated sum as a float, ``0.0`` for an empty input.
    """
    result = 0.0
    for i in range(len(arr)):
        result += arr[i] ** 2
    return result
# Compare performance on the same one-million-element array.
arr = np.random.randn(1000000)
# Baseline: pure-Python generator expression.
start = time.time()
result_python = sum(x**2 for x in arr)
print(f"Python耗时: {time.time() - start:.4f}秒")
# Warm-up call: numba compiles the function the first time it is called with
# a given argument type. Triggering that on a one-element slice keeps the
# one-off compilation cost out of the measurement below.
fast_calculation(arr[:1])
start = time.time()
result_numba = fast_calculation(arr)
print(f"Numba耗时: {time.time() - start:.4f}秒")
except ImportError:
print("请安装numba: pip install numba")
使用多进程
python
from multiprocessing import Pool
def process_chunk(chunk):
    """Return the per-group sum of column ``'值'`` grouped by ``'类别'``.

    Used as the worker function handed to ``Pool.map``; each call receives
    one piece of the full frame.

    Args:
        chunk: DataFrame-like object with ``'类别'`` and ``'值'`` columns.

    Returns:
        The result of ``chunk.groupby('类别')['值'].sum()``, indexed by the
        group key.
    """
    return chunk.groupby('类别')['值'].sum()
# Split into four chunks. np.array_split is applied to the row positions and
# each run is taken with iloc; calling it on the DataFrame itself goes through
# DataFrame.swapaxes, which pandas has deprecated. The chunk boundaries are
# the same ones np.array_split would produce.
chunks = [df.iloc[rows] for rows in np.array_split(np.arange(len(df)), 4)]
# Single-process baseline
start = time.time()
result_single = df.groupby('类别')['值'].sum()
print(f"单进程耗时: {time.time() - start:.4f}秒")
# Multi-process: one chunk per pool.map task, partial sums merged afterwards.
# NOTE: with the "spawn" or "forkserver" start methods the pool must be
# created under an `if __name__ == "__main__":` guard; see the multiprocessing
# programming guidelines. The elapsed time is printed by the next statement.
start = time.time()
with Pool(4) as pool:
    results = pool.map(process_chunk, chunks)
result_multi = pd.concat(results).groupby(level=0).sum()
print(f"多进程耗时: {time.time() - start:.4f}秒")
避免链式赋值
python
# 不好的做法:链式赋值(产生警告且慢)
# df[df['A'] > 0]['B'] = 1 # 不推荐
# 好的做法:使用loc
df.loc[df['A'] > 0, 'B'] = 1 # 推荐
缓存中间结果
python
# Anti-pattern: the same boolean filter df['A'] > 0 is evaluated twice
# NOTE(review): no frame defined in this document has a 'C' column — confirm
# which data this snippet is meant to run against
result = df[df['A'] > 0]['B'].sum() + df[df['A'] > 0]['C'].sum()
# Preferred: evaluate the filter once and reuse the filtered frame
filtered = df[df['A'] > 0]
result = filtered['B'].sum() + filtered['C'].sum()
计算优化是提升数据处理效率的关键,选择合适的优化方法可以显著减少运行时间。