# Source: reposted from the WeChat official account "数据科学实战" (Data Science in Practice).
import numpy as np
# --- Building NumPy arrays ---

# A flat Python list of prices becomes a one-dimensional array.
price_list = [143.73, 145.83, 143.68, 144.02, 143.5, 142.62]
price_array = np.array(price_list)
print(f"一维数组:{price_array}")

# A list of equal-length lists becomes a two-dimensional array (matrix);
# here each inner list is one row of four price values.
ohlc_rows = [
    [143.73, 145.90, 143.50, 145.83],
    [145.83, 146.20, 143.68, 143.68],
]
ohlc_data = np.array(ohlc_rows)
print(f"\n二维数组(矩阵):\n{ohlc_data}")

# Dedicated constructors, each shown right after it is created.
zeros_array = np.zeros(5)  # five zeros
print(f"\n全零数组:{zeros_array}")

ones_matrix = np.ones((2, 3))  # 2x3 matrix filled with ones
print(f"\n全一矩阵:\n{ones_matrix}")

price_range = np.linspace(100, 110, num=11)  # 11 evenly spaced points in [100, 110]
print(f"\n线性间隔数组:{price_range}")

random_returns = np.random.randn(10)  # 10 draws from the standard normal distribution
print(f"\n模拟随机收益率:{random_returns}")
import time
# --- Benchmark: Python loop vs. vectorised NumPy dot product ---

# Simulate a large portfolio holding 5000 assets.
num_assets = 5000

# Draw random non-negative weights and normalise them so they sum to 1.
weights = np.random.random(num_assets)
weights /= np.sum(weights)

# Draw one random return per asset, scaled by 0.01 to resemble small daily returns.
returns = np.random.randn(num_assets) * 0.01

# --- Method 1: explicit Python loop ---
# perf_counter() is the high-resolution clock intended for measuring short
# durations; time.time() may not tick at all over an interval this small.
start_time_loop = time.perf_counter()
portfolio_return_loop = 0.0
for weight, asset_return in zip(weights, returns):
    portfolio_return_loop += weight * asset_return
end_time_loop = time.perf_counter()
time_loop = (end_time_loop - start_time_loop) * 1000  # milliseconds
print(f"投资组合收益率(循环法):{portfolio_return_loop:.6f}")
print(f"耗时(循环法):{time_loop:.4f} 毫秒")

# --- Method 2: vectorised NumPy dot product ---
start_time_np = time.perf_counter()
portfolio_return_np = np.dot(weights, returns)
end_time_np = time.perf_counter()
time_np = (end_time_np - start_time_np) * 1000  # milliseconds
print(f"\n投资组合收益率(NumPy法):{portfolio_return_np:.6f}")
print(f"耗时(NumPy法):{time_np:.4f} 毫秒")

# --- Performance comparison ---
# Guard the ratio: if the NumPy timing is below the clock's resolution it reads
# as 0, and dividing by it would raise ZeroDivisionError.
speedup = time_loop / time_np if time_np > 0 else float("inf")
print(f"\nNumPy 方法大约比循环快 {speedup:.0f} 倍。")
# Prerequisite (run in a shell, not inside Python): pip install numpy-financial
import numpy_financial as npf
# Example: net present value (NPV) of a project.
# The project needs an initial investment of 100,000 dollars and is expected
# to return 30,000, 40,000, 50,000 and 60,000 dollars over the next four years.
# The discount rate is 8%.
rate = 0.08
cash_flows = np.array([-100000, 30000, 40000, 50000, 60000])

# numpy_financial.npv treats values[0] as the cash flow at t=0 (undiscounted)
# and discounts values[k] by (1 + rate) ** k. The full series, including the
# initial outlay, is therefore passed directly. Slicing off the first element
# and adding it back afterwards (the Excel NPV convention) would discount every
# future cash flow one period too little and overstate the result.
net_present_value = npf.npv(rate, cash_flows)
print(f"项目现金流:{cash_flows}")
print(f"折现率:{rate:.2%}")
print(f"净现值(NPV):${net_present_value:,.2f}")

# Example: internal rate of return (IRR) of the same cash-flow series.
internal_rate_of_return = npf.irr(cash_flows)
print(f"内部收益率(IRR):{internal_rate_of_return:.2%}")
import pandas as pd
# --- pandas Series: one column of prices labelled by date strings ---
aapl_prices = pd.Series(
    data=[171.5, 172.3, 170.9, 173.1],
    index=['2023-11-01', '2023-11-02', '2023-11-03', '2023-11-04'],
)
print("Pandas Series:")
print(aapl_prices)
# Label lookup returns the value stored under that index entry.
print(f"\n2023-11-02 的价格:{aapl_prices['2023-11-02']}")

# --- pandas DataFrame: built from a dict that maps column name -> values ---
data = {
    'Open': [171.5, 172.3, 170.9, 173.1],
    'High': [172.8, 173.5, 171.2, 174.0],
    'Low': [170.1, 171.8, 170.5, 172.5],
    'Close': [172.3, 170.9, 173.1, 173.9],
    'Volume': [5.2e7, 4.8e7, 5.5e7, 4.9e7],
}
# Parse the date strings so the frame gets a datetime index rather than plain strings.
dates = pd.to_datetime(
    ['2023-11-01', '2023-11-02', '2023-11-03', '2023-11-04']
)
df = pd.DataFrame(data, index=dates)
print("\nPandas DataFrame:")
print(df)
# --- Label-based selection with .loc ---

# A single date label selects one row.
print("\n使用 .loc 获取 2023-11-02 的数据:")
row_for_label = df.loc['2023-11-02']
print(row_for_label)

# A (row label, column label) pair selects a single cell.
close_price = df.loc['2023-11-03', 'Close']
print(f"\n2023-11-03 的收盘价:{close_price}")

# A label slice of rows combined with a list of column labels.
print("\n使用 .loc 选择行和列的切片:")
open_close_window = df.loc['2023-11-02':'2023-11-04', ['Open', 'Close']]
print(open_close_window)

# --- Position-based selection with .iloc ---

# Position 0 is the first row.
print("\n使用 .iloc 获取第一行数据:")
first_row = df.iloc[0]
print(first_row)

# Row position 3 (the fourth row) and column position 4 (the fifth column).
volume_val = df.iloc[3, 4]
print(f"\n第4天的成交量:{volume_val}")

# Row positions 1 and 2 (the slice end is exclusive), column positions 0 and 3.
print("\n使用 .iloc 选择行和列的切片:")
positional_window = df.iloc[1:3, [0, 3]]
print(positional_window)
# --- Boolean-mask filtering ---

# Keep only the rows whose closing price is above 172.
closes_above_172 = df['Close'] > 172
high_close_days = df[closes_above_172]
print("\n收盘价 > 172 的日子:")
print(high_close_days)

# Combine two conditions: volume above its threshold AND a high-low range
# above its threshold. Each condition is a boolean mask; `&` combines them
# element-wise.
high_volume_threshold = 5.0e7
large_range_threshold = 2.0
is_high_volume = df['Volume'] > high_volume_threshold
daily_range = df['High'] - df['Low']
is_wide_range = daily_range > large_range_threshold
active_days = df[is_high_volume & is_wide_range]
print("\n高成交量、大价格区间的日子:")
print(active_days)
