Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,17 @@ static/
# AI assistant
.cursor/
.claude/
.sisyphus/
AGENTS.md
!rdagent/**/AGENTS.md

# Custom scripts
scripts/

# Factor analysis outputs
factor_analysis_output/
selected_backtest_results.json
top_factors_performance.csv

# Documentation drafts
docs/FIN_FACTOR_*.md
44 changes: 44 additions & 0 deletions baseline_features/Net_Volume_Flow_20d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import numpy as np

def calculate_Net_Volume_Flow_20d():
    """Compute the 20-day net volume flow ratio and write it to ``result.h5``.

    Each day's volume is signed by the direction of the bar
    (``sign($close - $open)``: +1, -1 or 0).  The factor is the trailing
    20-row sum of signed volume divided by the trailing 20-row sum of raw
    volume, computed separately for every value of the ``instrument`` index
    level.  Rows without a full 20-row window are NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Net_Volume_Flow_20d`` to ``result.h5`` (key ``"data"``).
    """
    window = 20

    def _trailing_sum(series):
        # A full window is required, so partial windows yield NaN.
        return series.rolling(window=window, min_periods=window).sum()

    # Sorting the index keeps each instrument's rows in index order.
    data = pd.read_hdf("daily_pv.h5", key="data").sort_index()
    volume = data['$volume']

    # +1 for an up bar, -1 for a down bar, 0 when close equals open.
    bar_direction = np.sign(data['$close'] - data['$open'])

    net_flow = (volume * bar_direction).groupby(level='instrument').transform(_trailing_sum)
    gross_flow = volume.groupby(level='instrument').transform(_trailing_sum)

    # Guard against +/-inf produced by a division by zero.
    ratio = (net_flow / gross_flow).replace([np.inf, -np.inf], np.nan)

    ratio.to_frame('Net_Volume_Flow_20d').to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Net_Volume_Flow_20d()
46 changes: 46 additions & 0 deletions baseline_features/Return_Kurtosis_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import pandas as pd
import numpy as np

def calculate_Return_Kurtosis_20D():
    """Compute the 20-day population kurtosis of daily returns.

    For every value of the ``instrument`` index level the daily arithmetic
    return ``r_t = (Close_t - Close_{t-1}) / Close_{t-1}`` is computed, and
    for each trailing 20-return window the (non-excess) kurtosis

        ((1/20) * sum((r - r_bar)^4)) / ((1/20) * sum((r - r_bar)^2))^2

    is evaluated, where ``r_bar`` is the mean of *that same window*.
    Windows with fewer than 20 valid returns, or with zero variance, give
    NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Return_Kurtosis_20D`` to ``result.h5`` (key ``"data"``).
    """
    window = 20

    def _window_kurtosis(values):
        # Centre on the mean of the current window.  Taking deviations from a
        # per-day rolling mean and averaging them again would mix 20
        # different means into a single window.
        centered = values - np.mean(values)
        second_moment = np.mean(centered ** 2)
        if second_moment == 0 or np.isnan(second_moment):
            return np.nan
        return np.mean(centered ** 4) / second_moment ** 2

    # Load the daily price and volume data
    df = pd.read_hdf("daily_pv.h5", key="data")

    # Sort index to ensure correct time series operations
    df = df.sort_index()

    # Daily arithmetic returns, shifted within each instrument so that
    # prices of different instruments are never mixed.
    close = df['$close']
    prev_close = close.groupby(level='instrument').shift(1)
    ret = (close - prev_close) / prev_close

    # raw=True hands each full window to the helper as a NumPy array.
    kurtosis = ret.groupby(level='instrument').transform(
        lambda x: x.rolling(window=window, min_periods=window).apply(
            _window_kurtosis, raw=True
        )
    )

    # Create the result dataframe
    result_df = kurtosis.to_frame("Return_Kurtosis_20D")

    # Save the result to result.h5
    result_df.to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Return_Kurtosis_20D()
38 changes: 38 additions & 0 deletions baseline_features/Return_Sign_Autocorrelation_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import pandas as pd
import numpy as np

def calculate_Return_Sign_Autocorrelation_20D():
    """Compute the 20-day lag-1 autocorrelation of daily return signs.

    The sign of each daily close-to-close return (+1, -1, 0, or NaN) is
    correlated with its own one-row lag over a trailing 20-row window, per
    ``instrument``.  Rows without a full window are NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Return_Sign_Autocorrelation_20D``, indexed by
    ``['datetime', 'instrument']``, to ``result.h5`` (key ``"data"``).
    """
    lookback = 20

    def _lag1_corr(series):
        # Rolling correlation between the series and itself shifted by one row.
        return series.rolling(window=lookback, min_periods=lookback).corr(series.shift(1))

    # Work on a flat table ordered by instrument, then by date.
    table = (
        pd.read_hdf("daily_pv.h5", key="data")
        .reset_index()
        .sort_values(by=['instrument', 'datetime'])
    )

    # Sign of the per-instrument daily return.
    return_sign = np.sign(table.groupby('instrument')['$close'].pct_change())

    table['Return_Sign_Autocorrelation_20D'] = (
        return_sign.groupby(table['instrument']).transform(_lag1_corr)
    )

    # Restore the (datetime, instrument) index and keep only the factor column.
    output = table.set_index(['datetime', 'instrument'])[['Return_Sign_Autocorrelation_20D']]
    output.to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Return_Sign_Autocorrelation_20D()
37 changes: 37 additions & 0 deletions baseline_features/Return_Skewness_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pandas as pd
import numpy as np

def calculate_return_skewness_20d():
    """Compute the 20-day population skewness of daily returns.

    Daily close-to-close returns are computed per ``instrument``; for each
    trailing window of 20 returns the population skewness
    ``mean((r - mean)^3) / std^3`` (with ``ddof=0``) is evaluated.  Windows
    that are incomplete or have zero / undefined standard deviation give NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes the result to
    ``result.h5`` (key ``"data"``).

    Returns:
        The single-column DataFrame ``Return_Skewness_20D`` that was written.
    """
    window = 20

    def _population_skew(values):
        sigma = np.std(values, ddof=0)  # population standard deviation
        if np.isnan(sigma) or sigma == 0:
            return np.nan
        centered = values - np.mean(values)
        return np.mean(centered ** 3) / (sigma ** 3)

    def _rolling_skew(series):
        # raw=True passes each window to the helper as a NumPy array.
        return series.rolling(window=window, min_periods=window).apply(
            _population_skew, raw=True
        )

    prices = pd.read_hdf("daily_pv.h5", key="data").sort_index()

    # r_t = C_t / C_{t-1} - 1, computed within each instrument.
    daily_return = prices['$close'].groupby(level='instrument').pct_change()

    skewness = daily_return.groupby(level='instrument').transform(_rolling_skew)

    result = skewness.to_frame('Return_Skewness_20D')
    result.to_hdf("result.h5", key="data")

    return result


if __name__ == "__main__":
    calculate_return_skewness_20d()
39 changes: 39 additions & 0 deletions baseline_features/Return_ZScore_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import pandas as pd
import numpy as np

def calculate_Return_ZScore_20D():
    """Compute the 20-day z-score of the daily return.

    ``F_t = (r_t - mu) / sigma`` where ``r_t`` is the daily close-to-close
    return and ``mu`` / ``sigma`` are the mean and sample standard deviation
    (``ddof=1``, the pandas default) of the trailing 20 returns, all computed
    per ``instrument``.  Rows without a full window are NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Return_ZScore_20D`` to ``result.h5`` (key ``"data"``).
    """
    window = 20

    prices = pd.read_hdf("daily_pv.h5", key="data").sort_index()

    # Group by instrument so returns never span two different instruments.
    returns = prices['$close'].groupby(level='instrument').pct_change()
    per_instrument = returns.groupby(level='instrument')

    trailing_mean = per_instrument.transform(
        lambda s: s.rolling(window=window, min_periods=window).mean()
    )
    trailing_std = per_instrument.transform(
        lambda s: s.rolling(window=window, min_periods=window).std()
    )

    standardized = (returns - trailing_mean) / trailing_std

    standardized.to_frame("Return_ZScore_20D").to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Return_ZScore_20D()
41 changes: 41 additions & 0 deletions baseline_features/Risk_Adjusted_Momentum_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
import numpy as np

def calculate_Risk_Adjusted_Momentum_20D():
    """Compute 20-day momentum scaled by 20-day log-return volatility.

    ``F_t = R_{t,20} / sigma_{t,20}`` where ``R_{t,20}`` is the 20-row
    percentage change of ``$close`` and ``sigma_{t,20}`` is the sample
    standard deviation (``ddof=1``, the pandas default) of the trailing 20
    daily log returns ``ln(Close_t / Close_{t-1})``.  Both are computed per
    ``instrument``; rows without a full window are NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Risk_Adjusted_Momentum_20D`` to ``result.h5`` (key ``"data"``).
    """
    horizon = 20

    prices = pd.read_hdf("daily_pv.h5", key="data").sort_index()
    close = prices['$close']
    close_by_instrument = close.groupby(level='instrument')

    # Numerator: (Close_t - Close_{t-20}) / Close_{t-20}
    momentum = close_by_instrument.pct_change(horizon)

    # Denominator: rolling std of daily log returns.
    daily_log_return = np.log(close / close_by_instrument.shift(1))
    volatility = daily_log_return.groupby(level='instrument').transform(
        lambda s: s.rolling(window=horizon, min_periods=horizon).std()
    )

    risk_adjusted = momentum / volatility

    risk_adjusted.to_frame("Risk_Adjusted_Momentum_20D").to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Risk_Adjusted_Momentum_20D()
44 changes: 44 additions & 0 deletions baseline_features/Volatility_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import pandas as pd
import numpy as np

def calculate_Volatility_20D():
    """Compute the 20-day volatility of daily log returns.

    ``$close`` is pivoted to wide format (one column per instrument), daily
    log returns ``ln(P_t) - ln(P_{t-1})`` are formed, and their rolling
    sample standard deviation (``ddof=1``, the pandas default) over 20 rows
    is taken.

    The output has exactly one row per row of the sorted input; rows where
    the factor is undefined hold NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Volatility_20D``, indexed by ``['datetime', 'instrument']``, to
    ``result.h5`` (key ``"data"``).
    """
    # Load the daily price and volume data
    df = pd.read_hdf("daily_pv.h5", key="data")

    # Sort index to ensure correct time series operations
    df = df.sort_index()

    close = df['$close']

    # Wide format: index = datetime, columns = instruments
    close_unstacked = close.unstack(level='instrument')

    # Daily logarithmic returns: r_t = ln(P_t) - ln(P_{t-1})
    log_returns = np.log(close_unstacked) - np.log(close_unstacked.shift(1))

    # Sample standard deviation over a full 20-row window:
    # sqrt(1/(N-1) * sum((r - mean)^2))
    volatility = log_returns.rolling(window=20).std()

    # Back to long format.  Depending on the pandas version, stack() may drop
    # NaN entries or keep (datetime, instrument) pairs that are absent from
    # the input, so re-align to the input index to get a stable, complete
    # set of rows.
    # NOTE(review): assumes the input index levels are ordered
    # (datetime, instrument), as the names assigned below imply.
    factor_series = volatility.stack().reindex(df.index)

    # Name the series according to the factor definition
    factor_series.name = "Volatility_20D"

    # Convert to DataFrame
    result_df = factor_series.to_frame()

    # Ensure index names are correct
    result_df.index.names = ['datetime', 'instrument']

    # Save the result to result.h5
    result_df.to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Volatility_20D()
31 changes: 31 additions & 0 deletions baseline_features/Volume_Momentum_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd

def calculate_Volume_Momentum_20D():
    """Compute the ratio of recent to 20-day-lagged average volume.

    The factor is the 5-row moving average of ``$volume`` divided by the same
    moving average 20 rows earlier, computed per ``instrument``.  Rows without
    enough history, or whose lagged average volume is zero, are NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Volume_Momentum_20D`` to ``result.h5`` (key ``"data"``).
    """
    # Load the daily price and volume data
    df = pd.read_hdf("daily_pv.h5", key="data")

    # Sort the index so each instrument's rows are in index order
    df = df.sort_index()

    # Numerator: average volume of the current row and the previous 4 rows.
    # min_periods=5 leaves the first 4 rows of every instrument as NaN.
    vol_ma_5 = df.groupby(level='instrument')['$volume'].transform(
        lambda x: x.rolling(window=5, min_periods=5).mean()
    )

    # Denominator: the same 5-row average as of 20 rows earlier, i.e. the
    # average volume over rows t-24 .. t-20.
    vol_ma_5_lag20 = vol_ma_5.groupby(level='instrument').shift(20)

    # A zero lagged average would make the ratio +/-inf; mask it so the
    # factor is NaN there instead.
    vol_ma_5_lag20 = vol_ma_5_lag20.where(vol_ma_5_lag20 != 0)

    # Current 5-row average volume / 5-row average volume 20 rows ago
    factor_values = vol_ma_5 / vol_ma_5_lag20

    # Create the result dataframe with the required column name
    result_df = factor_values.to_frame('Volume_Momentum_20D')

    # Save the result to result.h5
    result_df.to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Volume_Momentum_20D()
36 changes: 36 additions & 0 deletions baseline_features/Volume_ZScore_20D.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pandas as pd
import numpy as np

def calculate_Volume_ZScore_20D():
    """Compute the 20-day z-score of daily volume.

    ``$volume`` is pivoted to wide format (one column per instrument) and the
    factor ``(volume - rolling_mean) / rolling_std`` is evaluated with a
    20-row window (sample standard deviation, ``ddof=1``, the pandas
    default).  Infinite values are replaced by NaN.

    The output has exactly one row per row of the sorted input; rows where
    the factor is undefined hold NaN.

    Reads ``daily_pv.h5`` (key ``"data"``) and writes a single-column frame
    named ``Volume_ZScore_20D`` to ``result.h5`` (key ``"data"``).
    """
    # Load the daily price and volume data
    df = pd.read_hdf("daily_pv.h5", key="data")

    # Sort by datetime so the time-series operations are correct.
    # The index is a MultiIndex with levels ['datetime', 'instrument'].
    df = df.sort_index(level='datetime')

    # Wide format: dates as index, instruments as columns
    volume_wide = df['$volume'].unstack(level='instrument')

    # 20-row rolling mean and standard deviation
    rolling_mean = volume_wide.rolling(window=20).mean()
    rolling_std = volume_wide.rolling(window=20).std()

    # Z-Score: (Volume - Moving Average) / Moving Standard Deviation
    factor_values_wide = (volume_wide - rolling_mean) / rolling_std

    # Replace infinite values with NaN (occurs if std is 0)
    factor_values_wide = factor_values_wide.replace([np.inf, -np.inf], np.nan)

    # Back to long format (MultiIndex: datetime, instrument).  Depending on
    # the pandas version, stack() may drop NaN entries or keep pairs that are
    # absent from the input, so re-align to the input index to get a stable,
    # complete set of rows.
    factor_values = factor_values_wide.stack().reindex(df.index)

    # Create the result dataframe
    result_df = factor_values.to_frame('Volume_ZScore_20D')

    # Save the result to result.h5
    result_df.to_hdf("result.h5", key="data")


if __name__ == "__main__":
    calculate_Volume_ZScore_20D()
Loading
Loading