microsoft · shin4 · Apr 28, 2026
diff --git a/.gitignore b/.gitignore
@@ -184,7 +184,17 @@ static/
 # AI assistant
 .cursor/
 .claude/
+.sisyphus/
 AGENTS.md
 !rdagent/**/AGENTS.md
 
+# Custom scripts
 scripts/
+
+# Factor analysis outputs
+factor_analysis_output/
+selected_backtest_results.json
+top_factors_performance.csv
+
+# Documentation drafts
+docs/FIN_FACTOR_*.md
diff --git a/baseline_features/Net_Volume_Flow_20d.py b/baseline_features/Net_Volume_Flow_20d.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Net_Volume_Flow_20d():
+    """Compute the 20-day net volume flow factor and write it to result.h5.
+
+    Each day's volume is signed by np.sign(close - open); the factor is the
+    20-day sum of signed volume divided by the 20-day sum of volume, taken
+    per instrument. Reads "daily_pv.h5" and returns nothing.
+    """
+    # Daily price/volume panel, ordered by its index
+    panel = pd.read_hdf("daily_pv.h5", key="data").sort_index()
+
+    def rolling_total(series):
+        # Sum over full 20-row windows only, one instrument at a time
+        return series.groupby(level='instrument').transform(
+            lambda s: s.rolling(window=20, min_periods=20).sum()
+        )
+
+    volume = panel['$volume']
+
+    # np.sign gives +1 when close > open, -1 when close < open, 0 when equal
+    net_flow = rolling_total(volume * np.sign(panel['$close'] - panel['$open']))
+    gross_flow = rolling_total(volume)
+
+    # Division can yield +/-inf (e.g. zero total volume); store those as NaN
+    ratio = (net_flow / gross_flow).replace([np.inf, -np.inf], np.nan)
+
+    # Single-column frame named after the factor
+    output = pd.DataFrame(ratio)
+    output.columns = ['Net_Volume_Flow_20d']
+    output.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Net_Volume_Flow_20d()
diff --git a/baseline_features/Return_Kurtosis_20D.py b/baseline_features/Return_Kurtosis_20D.py
@@ -0,0 +1,45 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Return_Kurtosis_20D():
+    """Compute the 20-day population kurtosis of daily returns into result.h5.
+
+    For each instrument and day, kurtosis = m4 / m2^2, where m2 and m4 are the
+    mean squared and mean fourth-power deviations of the last 20 daily returns
+    from the mean of those same 20 returns. Reads "daily_pv.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort index to ensure correct time series operations
+    df = df.sort_index()
+
+    # Calculate daily arithmetic returns: r_t = (Close_t - Close_{t-1}) / Close_{t-1}
+    close = df['$close']
+    prev_close = close.groupby(level='instrument').shift(1)
+    ret = (close - prev_close) / prev_close
+
+    def pop_kurtosis(window):
+        # Every deviation is measured from the mean of this window. Subtracting a
+        # per-day rolling mean instead would centre each return on a different
+        # window and delay the first value until 39 returns are available.
+        dev = window - np.mean(window)
+        m2 = np.mean(dev ** 2)
+        # Constant returns (m2 == 0) or non-finite input leave kurtosis undefined
+        if m2 == 0 or not np.isfinite(m2):
+            return np.nan
+        return np.mean(dev ** 4) / (m2 ** 2)
+
+    # min_periods=20 means only windows holding 20 non-NaN returns are evaluated
+    kurtosis = ret.groupby(level='instrument').transform(
+        lambda x: x.rolling(window=20, min_periods=20).apply(pop_kurtosis, raw=True)
+    )
+
+    # Create the result dataframe
+    result_df = kurtosis.to_frame("Return_Kurtosis_20D")
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Return_Kurtosis_20D()
diff --git a/baseline_features/Return_Sign_Autocorrelation_20D.py b/baseline_features/Return_Sign_Autocorrelation_20D.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Return_Sign_Autocorrelation_20D():
+    """Compute the 20-day lag-1 autocorrelation of daily return signs into result.h5.
+
+    For each instrument the sign of the daily close-to-close return is correlated
+    with its own previous value over a rolling 20-row window. Reads "daily_pv.h5";
+    returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Reset index to get 'datetime' and 'instrument' as columns for sorting
+    df = df.reset_index()
+
+    # Sort by instrument and datetime to ensure correct time-series operations
+    df = df.sort_values(by=['instrument', 'datetime'])
+
+    # Daily return r_t = Close_t / Close_{t-1} - 1 within each instrument.
+    # Written out instead of pct_change(): its legacy default forward-fills NaN
+    # closes (deprecated since pandas 2.1), which turns missing prices into
+    # zero returns and makes the result depend on the pandas version.
+    prev_close = df.groupby('instrument')['$close'].shift(1)
+    df['ret'] = df['$close'] / prev_close - 1
+
+    # np.sign returns 1 for positive, -1 for negative, 0 for zero, and NaN for NaN
+    df['sign'] = np.sign(df['ret'])
+
+    def rolling_autocorr(x, window=20):
+        # Rolling correlation between the series and its lag-1 copy;
+        # min_periods=window requires a full window of observations
+        return x.rolling(window=window, min_periods=window).corr(x.shift(1))
+
+    df['Return_Sign_Autocorrelation_20D'] = df.groupby('instrument')['sign'].transform(rolling_autocorr)
+
+    # Set the index back to ['datetime', 'instrument'] as required
+    result = df.set_index(['datetime', 'instrument'])[['Return_Sign_Autocorrelation_20D']]
+
+    # Save the result to result.h5
+    result.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Return_Sign_Autocorrelation_20D()
diff --git a/baseline_features/Return_Skewness_20D.py b/baseline_features/Return_Skewness_20D.py
@@ -0,0 +1,46 @@
+import pandas as pd
+import numpy as np
+
+def calculate_return_skewness_20d():
+    """Compute the 20-day population skewness of daily returns.
+
+    Reads "daily_pv.h5", writes the single-column frame to "result.h5" and
+    also returns it.
+    """
+    # Read the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort index to ensure correct time series operations
+    df = df.sort_index()
+
+    # Daily return r_t = C_t / C_{t-1} - 1 within each instrument.
+    # Written out instead of pct_change(): its legacy default forward-fills NaN
+    # closes (deprecated since pandas 2.1), which turns missing prices into
+    # zero returns and makes the result depend on the pandas version.
+    prev_close = df.groupby(level='instrument')['$close'].shift(1)
+    df['return'] = df['$close'] / prev_close - 1
+
+    # Population skewness = (1/N) * sum(((r - mean) / std)^3)
+    def pop_skewness(x):
+        mean = np.mean(x)
+        std = np.std(x, ddof=0)  # population standard deviation
+        # Constant or NaN-producing windows have no defined skewness
+        if std == 0 or np.isnan(std):
+            return np.nan
+        return np.mean((x - mean) ** 3) / (std ** 3)
+
+    # Calculate rolling skewness over 20-day window
+    df['Return_Skewness_20D'] = df.groupby(level='instrument')['return'].transform(
+        lambda x: x.rolling(window=20, min_periods=20).apply(pop_skewness, raw=True)
+    )
+
+    # Prepare the result
+    result = df[['Return_Skewness_20D']].copy()
+
+    # Save to HDF5 file
+    result.to_hdf("result.h5", key="data")
+
+    return result
+
+if __name__ == "__main__":
+    calculate_return_skewness_20d()
diff --git a/baseline_features/Return_ZScore_20D.py b/baseline_features/Return_ZScore_20D.py
@@ -0,0 +1,46 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Return_ZScore_20D():
+    """Compute the 20-day z-score of the daily return and write it to result.h5.
+
+    F_t = (r_t - mean of last 20 returns) / sample std of last 20 returns,
+    per instrument. Reads "daily_pv.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort index to ensure correct time series operations
+    df = df.sort_index()
+
+    # Daily simple return r_t = Close_t / Close_{t-1} - 1 within each instrument.
+    # Written out instead of pct_change(): its legacy default forward-fills NaN
+    # closes (deprecated since pandas 2.1), which turns missing prices into
+    # zero returns and makes the result depend on the pandas version.
+    close = df['$close']
+    ret = close / close.groupby(level='instrument').shift(1) - 1
+
+    # 20-day rolling mean; min_periods=20 requires a full window
+    rolling_mean = ret.groupby(level='instrument').transform(
+        lambda x: x.rolling(window=20, min_periods=20).mean()
+    )
+
+    # 20-day rolling standard deviation; pandas std() defaults to ddof=1 (N-1 = 19)
+    rolling_std = ret.groupby(level='instrument').transform(
+        lambda x: x.rolling(window=20, min_periods=20).std()
+    )
+
+    # Calculate Z-Score: F_t = (r_t - mu_{t,20}) / sigma_{t,20}
+    z_score = (ret - rolling_mean) / rolling_std
+
+    # A zero standard deviation can produce +/-inf; store those as NaN
+    z_score = z_score.replace([np.inf, -np.inf], np.nan)
+
+    # Create the result dataframe
+    result_df = z_score.to_frame("Return_ZScore_20D")
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Return_ZScore_20D()
diff --git a/baseline_features/Risk_Adjusted_Momentum_20D.py b/baseline_features/Risk_Adjusted_Momentum_20D.py
@@ -0,0 +1,47 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Risk_Adjusted_Momentum_20D():
+    """Compute 20-day return divided by 20-day volatility into result.h5.
+
+    F_t = R_{t,20} / sigma_{t,20}, where R is the close-to-close return over 20
+    rows and sigma the sample std of the last 20 daily log returns, per
+    instrument. Reads "daily_pv.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort index to ensure correct time series operations
+    df = df.sort_index()
+
+    # Close prices, grouped once so every lag stays inside one instrument
+    close = df['$close']
+    by_instrument = close.groupby(level='instrument')
+
+    # 1. R_{t,20} = Close_t / Close_{t-20} - 1.
+    # Written out instead of pct_change(20), whose legacy default forward-fills
+    # NaN closes and is deprecated since pandas 2.1.
+    cumulative_return = close / by_instrument.shift(20) - 1
+
+    # 2. Daily logarithmic returns r_t = ln(Close_t / Close_{t-1})
+    log_ret = np.log(close / by_instrument.shift(1))
+
+    # Rolling std over exactly 20 observations; std() defaults to ddof=1
+    rolling_std = log_ret.groupby(level='instrument').transform(
+        lambda x: x.rolling(window=20, min_periods=20).std()
+    )
+
+    # 3. Calculate the factor: F_t = R_{t, 20} / sigma_{t, 20}
+    factor_value = cumulative_return / rolling_std
+
+    # A zero standard deviation yields +/-inf; store those as NaN
+    factor_value = factor_value.replace([np.inf, -np.inf], np.nan)
+
+    # Create the result dataframe
+    result_df = factor_value.to_frame("Risk_Adjusted_Momentum_20D")
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Risk_Adjusted_Momentum_20D()
diff --git a/baseline_features/Volatility_20D.py b/baseline_features/Volatility_20D.py
@@ -0,0 +1,37 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Volatility_20D():
+    """Compute the 20-day volatility of daily log returns into result.h5.
+
+    Volatility is the sample standard deviation (ddof=1) of each instrument's
+    last 20 daily log returns. The output keeps the input index, with NaN where
+    fewer than 20 returns are available. Reads "daily_pv.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort index to ensure correct time series operations
+    df = df.sort_index()
+
+    # Daily logarithmic returns r_t = ln(P_t) - ln(P_{t-1}), differenced inside
+    # each instrument. Grouping replaces an unstack()/stack() round trip: in the
+    # wide layout an instrument missing some dates gets NaN rows that break its
+    # rolling windows, and stack() may drop NaN rows depending on the pandas version.
+    log_close = np.log(df['$close'])
+    log_returns = log_close.groupby(level='instrument').diff()
+
+    # Rolling sample standard deviation over exactly 20 returns
+    # Formula: sqrt(1/(N-1) * sum((r - mean)^2)); pandas std() defaults to ddof=1
+    volatility = log_returns.groupby(level='instrument').transform(
+        lambda x: x.rolling(window=20, min_periods=20).std()
+    )
+
+    # Name the column according to the factor definition
+    result_df = volatility.to_frame("Volatility_20D")
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Volatility_20D()
diff --git a/baseline_features/Volume_Momentum_20D.py b/baseline_features/Volume_Momentum_20D.py
@@ -0,0 +1,39 @@
+import pandas as pd
+
+def calculate_Volume_Momentum_20D():
+    """Compute the ratio of current to 20-day-lagged 5-day average volume.
+
+    F_t = mean(volume[t-4..t]) / mean(volume[t-24..t-20]) per instrument, NaN
+    where either average is unavailable or the lagged average is zero. Reads
+    "daily_pv.h5", writes "result.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort the index to ensure correct rolling calculation
+    df = df.sort_index()
+
+    # Numerator: 5-day moving average of volume for each instrument
+    # window=5 covers the current day and previous 4; min_periods=5 requires all 5
+    vol_ma_5 = df.groupby(level='instrument')['$volume'].transform(
+        lambda x: x.rolling(window=5, min_periods=5).mean()
+    )
+
+    # Denominator: the same average 20 rows earlier, i.e. the window [t-24, t-20]
+    vol_ma_5_lag20 = vol_ma_5.groupby(level='instrument').shift(20)
+
+    # A zero lagged average (five days without volume) would make the
+    # ratio +/-inf; mask it so the factor is NaN there instead
+    vol_ma_5_lag20 = vol_ma_5_lag20.where(vol_ma_5_lag20 != 0)
+
+    # Factor value: current 5-day avg volume / 5-day avg volume 20 days ago
+    factor_values = vol_ma_5 / vol_ma_5_lag20
+
+    # Create the result dataframe with the required format
+    result_df = factor_values.to_frame('Volume_Momentum_20D')
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Volume_Momentum_20D()
diff --git a/baseline_features/Volume_ZScore_20D.py b/baseline_features/Volume_ZScore_20D.py
@@ -0,0 +1,44 @@
+import pandas as pd
+import numpy as np
+
+def calculate_Volume_ZScore_20D():
+    """Compute the 20-day z-score of daily volume and write it to result.h5.
+
+    F_t = (volume_t - mean of last 20 volumes) / sample std of last 20 volumes,
+    per instrument. The output keeps the input index, with NaN where the window
+    is incomplete. Reads "daily_pv.h5"; returns nothing.
+    """
+    # Load the daily price and volume data
+    df = pd.read_hdf("daily_pv.h5", key="data")
+
+    # Sort by the 'datetime' index level so rows are in time order
+    df = df.sort_index(level='datetime')
+
+    # Roll inside each instrument rather than through unstack()/stack(): in the
+    # wide layout an instrument missing some dates gets NaN rows that break its
+    # rolling windows, and stack() may drop NaN rows depending on the pandas version.
+    volume = df['$volume']
+    by_instrument = volume.groupby(level='instrument')
+
+    # 20-day rolling mean and standard deviation (full windows only)
+    rolling_mean = by_instrument.transform(
+        lambda x: x.rolling(window=20, min_periods=20).mean()
+    )
+    rolling_std = by_instrument.transform(
+        lambda x: x.rolling(window=20, min_periods=20).std()
+    )
+
+    # Calculate the Z-Score: (Volume - Moving Average) / Moving Standard Deviation
+    factor_values = (volume - rolling_mean) / rolling_std
+
+    # Replace infinite values with NaN (occurs if std is 0)
+    factor_values = factor_values.replace([np.inf, -np.inf], np.nan)
+
+    # Create the result dataframe
+    result_df = factor_values.to_frame('Volume_ZScore_20D')
+
+    # Save the result to result.h5
+    result_df.to_hdf("result.h5", key="data")
+
+if __name__ == "__main__":
+    calculate_Volume_ZScore_20D()