Overview
Feature engineering is the most critical component of machine learning for cryptocurrency prediction. This guide covers all the advanced techniques used in CryptoView Pro’s XGBoost models.

Key Principle: Good features capture market structure, momentum, volatility, and regime changes while avoiding data leakage.
Feature Categories
CryptoView Pro’s XGBoost model (models/xgboost_model.py:50-128) creates 50+ features across 12 categories:
1. Returns Features
Multi-Timeframe Returns
# Returns capture price momentum across different horizons.

def create_return_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create return-based features across multiple timeframes.

    Expects a 'close' column (hourly bars assumed); returns a modified copy.
    """
    out = df.copy()
    close = out['close']

    # 1. Simple percentage returns: 1h, 4h, 1d, 1w
    for horizon in (1, 4, 24, 168):
        out[f'return_{horizon}'] = close.pct_change(horizon)

    # 2. Log returns are more stable for large moves
    for horizon in (1, 24):
        out[f'log_return_{horizon}'] = np.log(close / close.shift(horizon))

    # 3. Compounded 7-day return from the hourly series
    out['cumulative_return_7d'] = (1 + out['return_1']).rolling(168).apply(
        lambda w: w.prod() - 1
    )

    # 4. Second derivative of price (change of the change)
    out['return_acceleration'] = out['return_1'].diff()

    # 5. Realized volatility of hourly returns over 7 and 30 days
    out['return_volatility_7d'] = out['return_1'].rolling(168).std()
    out['return_volatility_30d'] = out['return_1'].rolling(720).std()
    return out
Return Ratios
# Risk-adjusted returns and higher moments over a rolling 7-day window
weekly_returns = df['return_1'].rolling(168)

# Sharpe-style ratio: mean return scaled by its volatility
df['sharpe_ratio_7d'] = weekly_returns.mean() / weekly_returns.std()

# Skewness captures asymmetry of the return distribution
df['return_skew_7d'] = weekly_returns.skew()

# Kurtosis captures tail risk (fat tails)
df['return_kurtosis_7d'] = weekly_returns.kurt()
2. Moving Average Features
Simple and Exponential Moving Averages
def create_moving_average_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Create comprehensive moving-average features.

    Adds SMA/EMA levels, price-to-MA ratios, crossover ratios, and MA
    slopes. Expects a 'close' column; returns a modified copy of `df`.
    """
    out = df.copy()
    close = out['close']

    # Simple Moving Averages plus mean-reversion indicators per window
    for w in (7, 14, 30, 50, 100, 200):
        sma = close.rolling(w).mean()
        out[f'sma_{w}'] = sma
        # Ratio > 1 means price is above its average
        out[f'price_to_sma_{w}'] = close / sma
        # Signed, normalized distance from the average
        out[f'distance_from_sma_{w}'] = (close - sma) / sma

    # Exponential Moving Averages weight recent data more heavily
    for span in (12, 26, 50, 100, 200):
        ema = close.ewm(span=span, adjust=False).mean()
        out[f'ema_{span}'] = ema
        out[f'price_to_ema_{span}'] = close / ema

    # Crossover ratios: golden/death cross and fast/slow EMA
    out['sma_50_200_ratio'] = out['sma_50'] / out['sma_200']
    out['ema_12_26_ratio'] = out['ema_12'] / out['ema_26']

    # Slopes of the long MAs approximate trend strength
    out['sma_50_slope'] = out['sma_50'].pct_change(5)
    out['sma_200_slope'] = out['sma_200'].pct_change(10)
    return out
Adaptive Moving Averages
def create_adaptive_ma(df: pd.DataFrame, fast_period: int = 2,
                       slow_period: int = 30, er_period: int = 10) -> pd.Series:
    """
    Kaufman's Adaptive Moving Average (KAMA).

    Adapts to market volatility: tracks price quickly in trends and slowly
    in ranging markets.

    Args:
        df: frame with a 'close' column.
        fast_period / slow_period: bounds for the adaptive smoothing constant.
        er_period: lookback for the efficiency ratio (default 10, matching
            the previously hard-coded window).

    Returns:
        KAMA series aligned with df.index (empty input yields an empty series).
    """
    close = df['close']

    # Efficiency ratio: |net change| / total path length over the window.
    change = (close - close.shift(er_period)).abs()
    volatility = close.diff().abs().rolling(er_period).sum()
    # BUG FIX: flat windows (volatility == 0) divided by zero, and the
    # NaN warm-up of the ratio poisoned the recursion below so the entire
    # KAMA series came out NaN. Treat undefined/zero-volatility windows as
    # ER = 0 (fully inefficient -> slowest smoothing).
    efficiency_ratio = (change / volatility.replace(0, np.nan)).fillna(0.0)

    # Adaptive smoothing constant, squared per Kaufman's formulation
    fastest = 2 / (fast_period + 1)
    slowest = 2 / (slow_period + 1)
    smooth = (efficiency_ratio * (fastest - slowest) + slowest) ** 2

    if len(close) == 0:
        # Guard: the original crashed on an empty frame (iloc[0]).
        return pd.Series(index=df.index, dtype=float)

    # Recursive filter; plain numpy loop avoids per-step .iloc overhead.
    values = close.to_numpy(dtype=float)
    sc = smooth.to_numpy(dtype=float)
    result = np.empty(len(values))
    result[0] = values[0]
    for i in range(1, len(values)):
        result[i] = result[i - 1] + sc[i] * (values[i] - result[i - 1])
    return pd.Series(result, index=df.index)
# Example usage: attach KAMA-derived features to an OHLCV frame `df`
# (assumes `df` has a 'close' column — see create_adaptive_ma above)
df['kama'] = create_adaptive_ma(df)
df['price_to_kama'] = df['close'] / df['kama']
3. Volatility Features
Rolling Volatility
def create_volatility_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Comprehensive volatility features.

    Expects 'open', 'high', 'low', 'close' and a precomputed 'return_1'
    column (see create_return_features). Returns a modified copy of `df`.
    """
    df = df.copy()

    # 1. Rolling standard deviation of returns (classic realized volatility)
    for window in [7, 14, 30, 60]:
        df[f'volatility_{window}'] = df['return_1'].rolling(window).std()

    # 2. Parkinson volatility (high/low based, more efficient than close-close)
    df['parkinson_vol'] = np.sqrt(
        (np.log(df['high'] / df['low']) ** 2).rolling(24).mean() / (4 * np.log(2))
    )

    # 3. Garman-Klass volatility (uses full OHLC information)
    df['gk_vol'] = np.sqrt(
        0.5 * (np.log(df['high'] / df['low']) ** 2).rolling(24).mean() -
        (2 * np.log(2) - 1) * (np.log(df['close'] / df['open']) ** 2).rolling(24).mean()
    )

    # 4. Volatility regime codes: 0=low, 1=medium, 2=high.
    # BUG FIX: combining explicit `labels` with duplicates='drop' makes
    # pd.qcut raise ValueError whenever bins collapse; labels=False returns
    # the same 0/1/2 codes safely.
    # NOTE(review): qcut over the full series places quantile edges using
    # future data — a mild look-ahead; for strict leakage hygiene compute
    # the edges on the training split only.
    vol_rolling = df['return_1'].rolling(24).std()
    df['vol_regime'] = pd.qcut(
        vol_rolling, q=3, labels=False, duplicates='drop'
    ).astype(float)

    # 5. Volatility of volatility (second-order risk).
    # BUG FIX: the original referenced df['volatility_24'], which is never
    # created (windows are 7/14/30/60) and raised KeyError. Use the 24-bar
    # rolling volatility computed above instead.
    df['vol_of_vol'] = vol_rolling.rolling(24).std()

    # 6. Realized vs implied volatility spread (requires options data)
    # df['vol_spread'] = df['implied_volatility'] - df['realized_volatility']
    return df
Bollinger Bands
def create_bollinger_features(df: pd.DataFrame, window: int = 20,
                              num_std: float = 2) -> pd.DataFrame:
    """
    Bollinger Bands: volatility-based support/resistance features.

    Args:
        df: frame with a 'close' column.
        window: SMA/std lookback for the middle band.
        num_std: band half-width in rolling standard deviations.
    """
    out = df.copy()
    close = out['close']

    # Middle band is an SMA; outer bands are offset by num_std std-devs
    mid = close.rolling(window).mean()
    offset = close.rolling(window).std() * num_std
    out['bb_middle'] = mid
    out['bb_upper'] = mid + offset
    out['bb_lower'] = mid - offset

    # Width normalized by price, and %B (position within the bands)
    band_span = out['bb_upper'] - out['bb_lower']
    out['bb_width'] = band_span / mid
    out['bb_position'] = (close - out['bb_lower']) / band_span

    # Touches of the outer bands flag overbought/oversold conditions
    out['bb_upper_touch'] = (close >= out['bb_upper']).astype(int)
    out['bb_lower_touch'] = (close <= out['bb_lower']).astype(int)

    # Squeeze: unusually narrow bands often precede a breakout
    narrow_threshold = out['bb_width'].rolling(50).quantile(0.2)
    out['bb_squeeze'] = (out['bb_width'] < narrow_threshold).astype(int)
    return out
4. Momentum Indicators
RSI (Relative Strength Index)
def create_rsi_features(df: pd.DataFrame, periods: list = None) -> pd.DataFrame:
    """
    RSI (Relative Strength Index) and derived features.

    Args:
        df: frame with a 'close' column.
        periods: RSI lookbacks; defaults to [14, 28].

    Returns:
        Modified copy of `df` with rsi_{p} columns plus oversold/overbought
        flags, and divergence/momentum features when rsi_14 is available.
    """
    df = df.copy()
    # Avoid the mutable-default-argument pitfall of the original signature.
    if periods is None:
        periods = [14, 28]

    # Price delta is period-independent: compute it once, outside the loop
    # (the original recomputed it per period).
    delta = df['close'].diff()
    gains = delta.where(delta > 0, 0)
    losses = -delta.where(delta < 0, 0)

    for period in periods:
        avg_gain = gains.rolling(period).mean()
        avg_loss = losses.rolling(period).mean()
        rs = avg_gain / avg_loss
        df[f'rsi_{period}'] = 100 - (100 / (1 + rs))

        # Normalized value plus classic threshold flags
        df[f'rsi_{period}_normalized'] = df[f'rsi_{period}'] / 100
        df[f'rsi_{period}_oversold'] = (df[f'rsi_{period}'] < 30).astype(int)
        df[f'rsi_{period}_overbought'] = (df[f'rsi_{period}'] > 70).astype(int)
        df[f'rsi_{period}_neutral'] = ((df[f'rsi_{period}'] >= 40) &
                                       (df[f'rsi_{period}'] <= 60)).astype(int)

    # BUG FIX: the original unconditionally read rsi_14, raising KeyError
    # for custom `periods` lists that omit 14. Guard the derived features.
    if 'rsi_14' in df.columns:
        # Divergence: price moving one way while RSI moves the other
        price_slope = df['close'].pct_change(14)
        rsi_slope = df['rsi_14'].pct_change(14)
        df['rsi_divergence'] = price_slope - (rsi_slope / 100)
        df['rsi_momentum'] = df['rsi_14'].diff(3)
    return df
MACD (Moving Average Convergence Divergence)
def create_macd_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    MACD (Moving Average Convergence Divergence) and derived signals.

    Expects a 'close' column; returns a modified copy of `df`.
    """
    out = df.copy()

    # MACD line = fast EMA minus slow EMA; signal line = EMA of MACD
    fast_ema = out['close'].ewm(span=12, adjust=False).mean()
    slow_ema = out['close'].ewm(span=26, adjust=False).mean()
    macd_line = fast_ema - slow_ema
    signal_line = macd_line.ewm(span=9, adjust=False).mean()
    out['macd'] = macd_line
    out['macd_signal'] = signal_line
    out['macd_hist'] = macd_line - signal_line

    # Histogram slope shows momentum building or fading
    out['macd_hist_slope'] = out['macd_hist'].diff()

    # Signal-line crossings: bullish crossover / bearish crossunder
    was_at_or_below = macd_line.shift(1) <= signal_line.shift(1)
    was_at_or_above = macd_line.shift(1) >= signal_line.shift(1)
    out['macd_crossover'] = ((macd_line > signal_line) & was_at_or_below).astype(int)
    out['macd_crossunder'] = ((macd_line < signal_line) & was_at_or_above).astype(int)

    # First and second differences of the MACD line
    out['macd_momentum'] = macd_line.diff(3)
    out['macd_acceleration'] = out['macd_momentum'].diff()

    # Zero-line position distinguishes bullish vs bearish regimes
    out['macd_above_zero'] = (macd_line > 0).astype(int)
    return out
5. Volume Features
def create_volume_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Volume-based confirmation features.

    No-op when the frame has no 'volume' column; otherwise returns a
    modified copy with relative-volume, VWAP, OBV and spike features.
    """
    out = df.copy()
    if 'volume' not in out.columns:
        return out

    vol = out['volume']
    close = out['close']

    # 1-2. Volume moving averages and relative volume
    out['volume_ma_7'] = vol.rolling(7).mean()
    out['volume_ma_30'] = vol.rolling(30).mean()
    out['volume_ratio_7'] = vol / out['volume_ma_7']
    out['volume_ratio_30'] = vol / out['volume_ma_30']

    # 3. First and second differences of volume
    out['volume_change'] = vol.pct_change()
    out['volume_acceleration'] = out['volume_change'].diff()

    # 4. Rolling 24-bar VWAP and price deviation from it
    out['vwap_daily'] = (close * vol).rolling(24).sum() / vol.rolling(24).sum()
    out['price_to_vwap'] = close / out['vwap_daily']

    # 5. On-Balance Volume: cumulative volume signed by price direction
    out['obv'] = (np.sign(close.diff()) * vol).fillna(0).cumsum()
    out['obv_ma'] = out['obv'].rolling(20).mean()

    # 6. Linear-regression slope of volume over 20 bars
    out['volume_trend'] = vol.rolling(20).apply(
        lambda w: np.polyfit(np.arange(len(w)), w, 1)[0]
    )

    # 7. Does volume confirm price moves?
    out['price_volume_corr'] = close.rolling(20).corr(vol)

    # 8. Spikes: volume more than 2 rolling z-scores above its 50-bar mean
    z_score = (vol - vol.rolling(50).mean()) / vol.rolling(50).std()
    out['volume_spike'] = (z_score > 2).astype(int)
    return out
6. Temporal Features
def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calendar/time-of-day seasonality features (crypto trades 24/7).

    No-op unless the frame is indexed by a DatetimeIndex.
    """
    out = df.copy()
    if not isinstance(out.index, pd.DatetimeIndex):
        return out

    idx = out.index
    out['hour'] = idx.hour
    out['day_of_week'] = idx.dayofweek  # Monday=0 ... Sunday=6
    out['day_of_month'] = idx.day
    out['month'] = idx.month
    out['quarter'] = idx.quarter

    # Sin/cos encoding keeps 23:00 adjacent to midnight and Sunday to Monday
    hour_angle = 2 * np.pi * out['hour'] / 24
    day_angle = 2 * np.pi * out['day_of_week'] / 7
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['day_sin'] = np.sin(day_angle)
    out['day_cos'] = np.cos(day_angle)

    # Saturday/Sunday flag
    out['is_weekend'] = (out['day_of_week'] >= 5).astype(int)

    # Rough regional session flags in UTC — crypto never closes, but
    # regional activity still clusters by time of day
    hour = out['hour']
    out['is_us_hours'] = ((hour >= 13) & (hour <= 21)).astype(int)
    out['is_asia_hours'] = ((hour >= 0) & (hour <= 8)).astype(int)
    out['is_europe_hours'] = ((hour >= 7) & (hour <= 15)).astype(int)
    return out
7. Lag Features
def create_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Historical lags so the model can 'remember' past values.

    Expects 'close' and 'return_1'; 'volume' and RSI columns are optional.
    Returns a modified copy of `df`.
    """
    df = df.copy()

    # Price and return lags: 1h, 2h, 3h, 6h, 12h, 1d, 2d, 1w
    lags = [1, 2, 3, 6, 12, 24, 48, 168]
    for lag in lags:
        shifted_close = df['close'].shift(lag)
        df[f'close_lag_{lag}'] = shifted_close
        df[f'return_lag_{lag}'] = df['return_1'].shift(lag)
        # Difference/ratio vs. current price act as mean-reversion signals
        df[f'close_diff_lag_{lag}'] = df['close'] - shifted_close
        df[f'close_ratio_lag_{lag}'] = df['close'] / shifted_close

    # Volume lags (optional column)
    if 'volume' in df.columns:
        for lag in [1, 3, 24]:
            df[f'volume_lag_{lag}'] = df['volume'].shift(lag)

    # Indicator lags.
    # BUG FIX: the original checked only for a bare 'rsi' column, which
    # create_rsi_features never produces (it emits 'rsi_14'), so the branch
    # was dead code. Accept either name, preferring the legacy 'rsi'.
    rsi_col = 'rsi' if 'rsi' in df.columns else (
        'rsi_14' if 'rsi_14' in df.columns else None)
    if rsi_col is not None:
        df['rsi_lag_1'] = df[rsi_col].shift(1)
        df['rsi_change'] = df[rsi_col] - df['rsi_lag_1']
    return df
8. Advanced Pattern Features
Price Patterns
def create_pattern_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Candlestick pattern and fractal features from OHLC data.
    """
    out = df.copy()
    o, h, l, c = out['open'], out['high'], out['low'], out['close']

    # 1. Candle anatomy: body, shadows, full range
    body_top = out[['close', 'open']].max(axis=1)
    body_bottom = out[['close', 'open']].min(axis=1)
    out['candle_body'] = (c - o).abs()
    out['candle_upper_shadow'] = h - body_top
    out['candle_lower_shadow'] = body_bottom - l
    out['candle_total_range'] = h - l

    # 2. Share of the range covered by the body
    out['body_to_range'] = out['candle_body'] / out['candle_total_range']

    # 3. Doji: tiny body relative to range signals indecision
    out['is_doji'] = (out['body_to_range'] < 0.1).astype(int)

    # 4. Hammer (long lower shadow on a bullish candle) / shooting star
    long_lower = out['candle_lower_shadow'] > 2 * out['candle_body']
    long_upper = out['candle_upper_shadow'] > 2 * out['candle_body']
    out['is_hammer'] = (long_lower & (c > o)).astype(int)
    out['is_shooting_star'] = (long_upper & (c < o)).astype(int)

    # 5. Bullish engulfing: green body fully wraps the prior red body
    prev_open, prev_close = o.shift(1), c.shift(1)
    out['bullish_engulfing'] = ((c > o) &
                                (prev_close < prev_open) &
                                (o < prev_close) &
                                (c > prev_open)).astype(int)

    # 6. Fractal extrema.
    # NOTE(review): shift(-1) peeks one bar into the FUTURE — these flags
    # leak if fed to a live model; only safe for labeling or retrospective
    # analysis.
    out['is_local_max'] = ((h > h.shift(1)) & (h > h.shift(-1))).astype(int)
    out['is_local_min'] = ((l < l.shift(1)) & (l < l.shift(-1))).astype(int)

    # 7. Distance to the trailing 24-bar high/low (support/resistance)
    out['distance_to_24h_high'] = (h.rolling(24).max() - c) / c
    out['distance_to_24h_low'] = (c - l.rolling(24).min()) / c
    return out
Trend Features
def create_trend_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Trend direction and strength features: regression slopes, a simplified
    ADX, and an EMA-based Parabolic-SAR proxy.
    """
    out = df.copy()

    # 1. Rolling linear-regression slope of close over several windows
    for w in (7, 14, 30):
        out[f'trend_slope_{w}'] = out['close'].rolling(w).apply(
            lambda vals, n=w: (np.polyfit(np.arange(len(vals)), vals, 1)[0]
                               if len(vals) == n else np.nan)
        )

    # 2. ADX-style trend strength from directional movement
    up_move = out['high'].diff()
    down_move = -out['low'].diff()
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0)

    # True range = widest of the three candidate ranges
    prev_close = out['close'].shift()
    true_range = pd.concat([
        out['high'] - out['low'],
        (out['high'] - prev_close).abs(),
        (out['low'] - prev_close).abs(),
    ], axis=1).max(axis=1)
    avg_true_range = true_range.rolling(14).mean()

    plus_di = 100 * (plus_dm.rolling(14).mean() / avg_true_range)
    minus_di = 100 * (minus_dm.rolling(14).mean() / avg_true_range)
    out['adx'] = 100 * (plus_di - minus_di).abs() / (plus_di + minus_di)
    out['adx'] = out['adx'].rolling(14).mean()  # smooth the raw DX

    # 3. Parabolic-SAR proxy: a 20-span EMA stands in for the true SAR
    out['sar'] = out['close'].ewm(span=20).mean()
    out['price_above_sar'] = (out['close'] > out['sar']).astype(int)
    return out
9. Feature Selection
Importance-Based Selection
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb
def select_best_features(X, y, k=30):
    """
    Select the top-k features by a univariate linear-regression F-test.

    Note: despite the original comment, this uses f_regression (an F-test),
    not mutual information; substitute mutual_info_regression for that.

    Args:
        X: feature DataFrame.
        y: target vector aligned with X.
        k: number of features to keep.

    Returns:
        Tuple of (X restricted to the selected columns, list of their names).
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)
    # Rank every feature by its univariate score, best first
    scores = pd.DataFrame({
        'feature': X.columns,
        'score': selector.scores_
    }).sort_values('score', ascending=False)
    selected_features = scores.head(k)['feature'].tolist()
    return X[selected_features], selected_features
def get_xgboost_feature_importance(model, feature_names):
    """
    Return a DataFrame of feature importances, highest first.

    Args:
        model: fitted model exposing a `feature_importances_` attribute.
        feature_names: iterable of names aligned with the model's inputs.
    """
    ranked = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    return ranked.sort_values('importance', ascending=False)
Correlation-Based Filtering
def remove_correlated_features(X, threshold=0.95):
    """
    Drop features whose absolute correlation with an earlier column
    exceeds `threshold`, reducing redundancy.

    Returns:
        Tuple of (filtered DataFrame, list of dropped column names).
    """
    abs_corr = X.corr().abs()
    # Keep the strict upper triangle so each pair is examined exactly once
    triangle_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(triangle_mask)
    to_drop = [col for col in upper_tri.columns
               if (upper_tri[col] > threshold).any()]
    return X.drop(columns=to_drop), to_drop
10. Feature Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
def scale_features(X_train, X_test, method='minmax'):
    """
    Fit a scaler on the training split only (avoids test-set leakage) and
    apply it to both splits. Tree-based models like XGBoost don't need
    scaling, but linear models and neural nets do.

    Args:
        X_train / X_test: train and test feature matrices.
        method: 'minmax' ([0, 1]), 'standard' (zero mean, unit variance),
            or 'robust' (median/IQR, resistant to outliers).

    Returns:
        Tuple of (scaled train array, scaled test array, fitted scaler).

    Raises:
        ValueError: for an unrecognized `method`.
            (BUG FIX: the original fell through with `scaler` undefined,
            producing an opaque NameError.)
    """
    if method == 'minmax':
        scaler = MinMaxScaler()
    elif method == 'standard':
        scaler = StandardScaler()
    elif method == 'robust':
        scaler = RobustScaler()
    else:
        raise ValueError(f"Unknown scaling method: {method!r}")
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler
Best Practices
Avoid Data Leakage: Never use future information in features. Example: Don’t use df['close'].shift(-1) as a feature!

1. Feature Engineering Workflow
- Start simple: Begin with basic returns, MAs, and momentum
- Add complexity: Gradually add advanced features
- Test incremental: Evaluate each feature’s impact
- Remove redundant: Drop highly correlated features
- Monitor performance: Track feature importance over time
2. Handling NaN Values
- Rolling windows create NaN at the start
- Use .fillna(method='bfill') (or the newer .bfill()) or .dropna() after feature creation
- Never use .fillna(0) unless 0 is meaningful
- Document which features cause NaN and why
3. Time-Series Considerations
- Never shuffle: Maintain temporal order
- Use lags carefully: Too many lags = overfitting
- Stationarity: Crypto returns are more stationary than prices
- Regime changes: Markets change, retrain regularly
4. Domain Knowledge
- Crypto-specific: Consider exchange hours, whale movements
- Technical analysis: RSI, MACD are proven indicators
- Market microstructure: Volume, bid-ask spread matter
- Sentiment: Twitter/Reddit sentiment can be predictive
Complete Feature Engineering Pipeline
import pandas as pd
import numpy as np
class CryptoFeatureEngineer:
    """
    End-to-end feature-engineering pipeline for cryptocurrency prediction.

    Chains every feature-creation helper in this module, appends the
    next-period price target, and prepares a cleaned feature matrix.
    """

    def __init__(self):
        self.scaler = None
        self.selected_features = None

    def create_all_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Run every feature group in order and append the prediction target.
        """
        result = df.copy()

        # Order matters: returns come first because several later groups
        # read the 'return_1' column they create.
        pipeline = (
            create_return_features,          # 1. returns
            create_moving_average_features,  # 2. moving averages
            create_volatility_features,      # 3a. volatility
            create_bollinger_features,       # 3b. Bollinger bands
            create_rsi_features,             # 4a. RSI momentum
            create_macd_features,            # 4b. MACD momentum
            create_volume_features,          # 5. volume
            create_temporal_features,        # 6. calendar/seasonality
            create_lag_features,             # 7. lags
            create_pattern_features,         # 8a. candlestick patterns
            create_trend_features,           # 8b. trend strength
        )
        for step in pipeline:
            result = step(result)

        # Target: next period's close (shift(-1) is only valid as a LABEL,
        # never as an input feature — that would be leakage)
        result['target'] = result['close'].shift(-1)

        # Rolling windows leave NaN warm-up rows at the start; drop them
        return result.dropna()

    def prepare_for_training(self, df: pd.DataFrame, target_col: str = 'target'):
        """
        Split a feature frame into (X, y) and prune redundant columns.
        """
        # Raw OHLCV columns are excluded: the model should only see the
        # engineered features, not the untransformed prices
        exclude_cols = [target_col, 'open', 'high', 'low', 'close', 'volume']
        feature_cols = [col for col in df.columns if col not in exclude_cols]
        X = df[feature_cols]
        y = df[target_col]

        # Drop near-duplicate features to reduce redundancy
        X, dropped = remove_correlated_features(X, threshold=0.95)
        print(f"Dropped {len(dropped)} correlated features")

        # Optional further selection:
        # X, self.selected_features = select_best_features(X, y, k=50)
        return X, y
# Usage: build the full feature set from an OHLCV DataFrame `df`
# (assumes `df` has open/high/low/close/volume and a DatetimeIndex)
engineer = CryptoFeatureEngineer()
df_with_features = engineer.create_all_features(df)
X, y = engineer.prepare_for_training(df_with_features)
print(f"Created {X.shape[1]} features from raw OHLCV data")
Testing Feature Quality
def evaluate_feature_predictive_power(X, y, feature_name):
    """
    R² of a univariate linear regression from one feature to the target.

    Args:
        X: feature DataFrame.
        y: target values aligned with X (array-like).
        feature_name: column of X to evaluate.

    Returns:
        In-sample R² score of the single-feature fit.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score

    X_feature = X[feature_name].values.reshape(-1, 1)
    y_values = np.asarray(y, dtype=float)

    # BUG FIX: the original masked NaN only in the feature; NaN left in the
    # target would still crash the fit. Mask both sides jointly.
    mask = ~np.isnan(X_feature.flatten()) & ~np.isnan(y_values)
    X_clean = X_feature[mask]
    y_clean = y_values[mask]

    # Fit a simple one-variable linear regression and score it
    model = LinearRegression()
    model.fit(X_clean, y_clean)
    return r2_score(y_clean, model.predict(X_clean))
# Screen every engineered feature and report the individually predictive ones
# (assumes X, y come from prepare_for_training above)
for feature in X.columns:
    r2 = evaluate_feature_predictive_power(X, y, feature)
    if r2 > 0.1: # Only show features with R² > 0.1
        print(f"{feature}: R² = {r2:.4f}")
Next Steps
Custom Models
Use these features in custom prediction models
Backtesting
Test your features with historical data
XGBoost Reference
See the full XGBoost model implementation
Production Tips
Deploy feature engineering pipelines