diff --git a/eval_data/ohlcv_sample.csv b/eval_data/ohlcv_sample.csv
new file mode 100644
index 00000000..ad335788
--- /dev/null
+++ b/eval_data/ohlcv_sample.csv
@@ -0,0 +1,6 @@
+date,symbol,open,high,low,close,volume
+2020-01-01,AAPL,75,76,74,75.5,1000000
+2020-01-02,AAPL,75.5,77,75,76.8,1200000
+2020-01-03,AAPL,76.8,78,76,77.5,1100000
+2020-01-04,AAPL,77.5,79,77,78.2,1300000
+2020-01-05,AAPL,78.2,80,78,79.5,1250000
\ No newline at end of file
diff --git a/examples/data_split.py b/examples/data_split.py
new file mode 100644
index 00000000..1e4b0a84
--- /dev/null
+++ b/examples/data_split.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+def split_data(df, train_size=3, test_size=1):
+    splits = []
+    for start in range(0, len(df) - train_size - test_size + 1):
+        train = df.iloc[start:start + train_size]
+        test = df.iloc[start + train_size:start + train_size + test_size]
+        splits.append((train, test))
+    return splits
+
+splits = split_data(df)
+
+for i, (train, test) in enumerate(splits):
+    print(f"Split {i}")
+    print("Train:")
+    print(train[["date", "close"]])
+    print("Test:")
+    print(test[["date", "close"]])
+    print("-" * 20)
\ No newline at end of file
diff --git a/examples/leaky_strategy.py b/examples/leaky_strategy.py
new file mode 100644
index 00000000..c5f457b3
--- /dev/null
+++ b/examples/leaky_strategy.py
@@ -0,0 +1,11 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+# Intentionally bad: uses tomorrow's close today.
+df["future_return"] = df["close"].shift(-1) / df["close"]
+
+df["signal"] = df["future_return"] > 1
+df["strategy_return"] = df["signal"] * df["future_return"]
+
+print(df[["date", "symbol", "close", "future_return", "signal", "strategy_return"]])
\ No newline at end of file
diff --git a/examples/metrics_report.py b/examples/metrics_report.py
new file mode 100644
index 00000000..e94439c6
--- /dev/null
+++ b/examples/metrics_report.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+df["past_return"] = df["close"] / df["close"].shift(1)
+df["signal"] = df["past_return"] > 1
+
+fee_rate = 0.001
+slippage_rate = 0.0005
+
+df["trade"] = df["signal"].astype(int).diff().abs().fillna(df["signal"].astype(int))
+df["gross_return"] = df["signal"] * df["past_return"]
+df["cost"] = df["trade"] * (fee_rate + slippage_rate)
+df["net_return"] = (df["gross_return"] - df["cost"]).fillna(0)
+
+total_return = df["net_return"].sum()
+num_trades = int(df["trade"].sum())
+# Drawdown is the peak-to-trough drop of the cumulative net-return curve, not of per-period returns.
+equity_curve = df["net_return"].cumsum()
+max_drawdown = (equity_curve.cummax() - equity_curve).max()
+sharpe = df["net_return"].mean() / df["net_return"].std() if df["net_return"].std() != 0 else 0
+
+metrics = {
+    "total_return": total_return,
+    "sharpe": sharpe,
+    "max_drawdown": max_drawdown,
+    "num_trades": num_trades
+}
+
+print(metrics)
\ No newline at end of file
diff --git a/examples/safe_optimizer.py b/examples/safe_optimizer.py
new file mode 100644
index 00000000..6db33314
--- /dev/null
+++ b/examples/safe_optimizer.py
@@ -0,0 +1,43 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+df["past_return"] = df["close"] / df["close"].shift(1)
+
+def run_strategy(data, threshold):
+    data = data.copy()
+    data["signal"] = data["past_return"] > threshold
+    data["strategy_return"] = data["signal"] * data["past_return"]
+    return data["strategy_return"].fillna(0).sum()
+
+def split_data(df, train_size=3, test_size=1):
+    splits = []
+    for start in range(0, len(df) - train_size - test_size + 1):
+        train = df.iloc[start:start + train_size]
+        test = df.iloc[start + train_size:start + train_size + test_size]
+        splits.append((train, test))
+    return splits
+
+thresholds = [1.005, 1.01, 1.015]
+results = []
+
+for split_id, (train, test) in enumerate(split_data(df)):
+    train_scores = {}
+
+    for threshold in thresholds:
+        train_scores[threshold] = run_strategy(train, threshold)
+
+    best_threshold = max(train_scores, key=train_scores.get)
+
+    test_score = run_strategy(test, best_threshold)
+
+    results.append({
+        "split": split_id,
+        "best_threshold": best_threshold,
+        "train_score": train_scores[best_threshold],
+        "test_score": test_score
+    })
+
+results_df = pd.DataFrame(results)
+
+print(results_df)
\ No newline at end of file
diff --git a/examples/trading_costs.py b/examples/trading_costs.py
new file mode 100644
index 00000000..e35a9c28
--- /dev/null
+++ b/examples/trading_costs.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+df["past_return"] = df["close"] / df["close"].shift(1)
+df["signal"] = df["past_return"] > 1
+
+fee_rate = 0.001  # 0.1% fee
+slippage_rate = 0.0005  # 0.05% slippage
+
+df["trade"] = df["signal"].astype(int).diff().abs().fillna(df["signal"].astype(int))
+df["gross_return"] = df["signal"] * df["past_return"]
+df["cost"] = df["trade"] * (fee_rate + slippage_rate)
+df["net_return"] = df["gross_return"] - df["cost"]
+
+print(df[["date", "close", "signal", "trade", "gross_return", "cost", "net_return"]])
\ No newline at end of file
diff --git a/examples/walk_forward.py b/examples/walk_forward.py
new file mode 100644
index 00000000..846327e4
--- /dev/null
+++ b/examples/walk_forward.py
@@ -0,0 +1,28 @@
+import pandas as pd
+
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+df["past_return"] = df["close"] / df["close"].shift(1)
+
+# parameters
+train_size = 3
+test_size = 1
+
+results = []
+
+for start in range(0, len(df) - train_size - test_size + 1):
+    train = df.iloc[start:start + train_size]
+    test = df.iloc[start + train_size:start + train_size + test_size]
+
+    # simple rule learned from train
+    threshold = train["past_return"].mean()
+
+    test = test.copy()
+    test["signal"] = test["past_return"] > threshold
+    test["strategy_return"] = test["signal"] * test["past_return"]
+
+    results.append(test)
+
+final = pd.concat(results)
+
+print(final[["date", "close", "past_return", "signal", "strategy_return"]])
\ No newline at end of file
diff --git a/tests/test_leakage_detection.py b/tests/test_leakage_detection.py
new file mode 100644
index 00000000..209961c3
--- /dev/null
+++ b/tests/test_leakage_detection.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+def test_no_future_data_used():
+    df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+    # SAFE logic (past only)
+    df["past_return"] = df["close"] / df["close"].shift(1)
+
+    # Ensure first value is NaN (no future access)
+    assert pd.isna(df["past_return"].iloc[0])
+
+    # Ensure no use of future data
+    assert "future_return" not in df.columns
\ No newline at end of file
diff --git a/tests/test_metrics_report.py b/tests/test_metrics_report.py
new file mode 100644
index 00000000..54fe8b8d
--- /dev/null
+++ b/tests/test_metrics_report.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+def test_metrics_exist():
+    df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+    df["past_return"] = df["close"] / df["close"].shift(1)
+    df["signal"] = df["past_return"] > 1
+    df["trade"] = df["signal"].astype(int).diff().abs().fillna(df["signal"].astype(int))
+    df["net_return"] = df["past_return"].fillna(0)
+
+    metrics = {
+        "total_return": df["net_return"].sum(),
+        "sharpe": 0,
+        "max_drawdown": 0,
+        "num_trades": int(df["trade"].sum())
+    }
+
+    assert "total_return" in metrics
+    assert "sharpe" in metrics
+    assert "max_drawdown" in metrics
+    assert "num_trades" in metrics
\ No newline at end of file
diff --git a/tests/test_safe_optimizer.py b/tests/test_safe_optimizer.py
new file mode 100644
index 00000000..d1f1aec3
--- /dev/null
+++ b/tests/test_safe_optimizer.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+def test_optimizer_uses_train_before_test():
+    df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+    df["past_return"] = df["close"] / df["close"].shift(1)
+
+    train = df.iloc[:3]
+    test = df.iloc[3:4]
+
+    assert train.index.max() < test.index.min()
+
+def test_optimizer_does_not_use_future_return():
+    df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+    df["past_return"] = df["close"] / df["close"].shift(1)
+
+    assert "future_return" not in df.columns
+    assert pd.isna(df["past_return"].iloc[0])
\ No newline at end of file
diff --git a/tests/test_walk_forward.py b/tests/test_walk_forward.py
new file mode 100644
index 00000000..88e0a8e1
--- /dev/null
+++ b/tests/test_walk_forward.py
@@ -0,0 +1,17 @@
+import pandas as pd
+
+def test_walk_forward_no_leakage():
+    df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+    df["past_return"] = df["close"] / df["close"].shift(1)
+
+    train = df.iloc[:3]
+    test = df.iloc[3:4]
+
+    threshold = train["past_return"].mean()
+
+    test = test.copy()
+    test["signal"] = test["past_return"] > threshold
+
+    # ensure test does not use future data
+    assert test.index.min() > train.index.max()
\ No newline at end of file
diff --git a/vectorbt/examples/leaky_strategy.py b/vectorbt/examples/leaky_strategy.py
new file mode 100644
index 00000000..b1759802
--- /dev/null
+++ b/vectorbt/examples/leaky_strategy.py
@@ -0,0 +1,16 @@
+import pandas as pd
+
+# Load data
+df = pd.read_csv("eval_data/ohlcv_sample.csv", parse_dates=["date"])
+
+# ❌ BAD: using future data (this is intentional leakage)
+df["future_return"] = df["close"].shift(-1) / df["close"]
+
+# Generate signals (cheating)
+df["signal"] = df["future_return"] > 1
+
+# Strategy returns
+df["strategy_return"] = df["signal"] * df["future_return"]
+
+print("Leaky strategy output:")
+print(df[["date", "symbol", "close", "future_return", "signal", "strategy_return"]])
\ No newline at end of file