Skip to main content

Overview

Feature engineering is the most critical component of machine learning for cryptocurrency prediction. This guide covers all the advanced techniques used in CryptoView Pro’s XGBoost models.
Key Principle: Good features capture market structure, momentum, volatility, and regime changes while avoiding data leakage.

Feature Categories

CryptoView Pro’s XGBoost model (models/xgboost_model.py:50-128) creates 50+ features across 12 categories:

1. Returns Features

Multi-Timeframe Returns

Returns capture price momentum across different horizons:
def create_return_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add momentum features based on simple, log, and cumulative returns.

    Expects a 'close' column; returns a copy of *df* with the new columns.
    Window sizes are in hours (168 = 1 week, 720 = 30 days).
    """
    out = df.copy()
    close = out['close']

    # 1. Simple percentage returns over several horizons.
    for horizon in (1, 4, 24, 168):
        out[f'return_{horizon}'] = close.pct_change(horizon)

    # 2. Log returns: additive and better behaved for large moves.
    for horizon in (1, 24):
        out[f'log_return_{horizon}'] = np.log(close / close.shift(horizon))

    # 3. Compounded 7-day return: product of hourly growth factors, minus 1.
    growth = 1 + out['return_1']
    out['cumulative_return_7d'] = growth.rolling(168).apply(
        lambda w: w.prod() - 1
    )

    # 4. Change of the 1h return - a crude second derivative of price.
    out['return_acceleration'] = out['return_1'].diff()

    # 5. Realized volatility: rolling std of hourly returns.
    out['return_volatility_7d'] = out['return_1'].rolling(168).std()
    out['return_volatility_30d'] = out['return_1'].rolling(720).std()

    return out
Why it matters: Returns are scale-invariant and capture momentum better than absolute prices.

Return Ratios

# Risk-adjusted and higher-moment return statistics over one week (168h).
# NOTE: assumes `df` with a 'return_1' column exists in the surrounding scope.
r7 = df['return_1'].rolling(168)

# Rolling Sharpe-style ratio: mean return divided by its volatility.
df['sharpe_ratio_7d'] = r7.mean() / r7.std()

# Skewness: asymmetry of the return distribution.
df['return_skew_7d'] = r7.skew()

# Kurtosis: tail heaviness (crash/spike risk).
df['return_kurtosis_7d'] = r7.kurt()

2. Moving Average Features

Simple and Exponential Moving Averages

def create_moving_average_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add SMA/EMA levels, price-to-MA ratios, crossovers, and MA slopes.

    Expects a 'close' column; returns a copy with the new columns.
    Ratios (price / MA) are generally more informative than raw MA levels.
    """
    out = df.copy()
    close = out['close']

    # Simple moving averages plus two normalized distance measures each.
    for win in (7, 14, 30, 50, 100, 200):
        sma = close.rolling(win).mean()
        out[f'sma_{win}'] = sma
        # Mean-reversion indicator: how stretched is price vs its average?
        out[f'price_to_sma_{win}'] = close / sma
        out[f'distance_from_sma_{win}'] = (close - sma) / sma

    # Exponential moving averages weight recent prices more heavily.
    for span in (12, 26, 50, 100, 200):
        ema = close.ewm(span=span, adjust=False).mean()
        out[f'ema_{span}'] = ema
        out[f'price_to_ema_{span}'] = close / ema

    # Classic crossover signals expressed as ratios (>1 is bullish).
    out['sma_50_200_ratio'] = out['sma_50'] / out['sma_200']  # golden/death cross
    out['ema_12_26_ratio'] = out['ema_12'] / out['ema_26']    # fast vs slow EMA

    # Slope of the long MAs approximates trend strength.
    out['sma_50_slope'] = out['sma_50'].pct_change(5)
    out['sma_200_slope'] = out['sma_200'].pct_change(10)

    return out
Key Insight: The ratio of price to MA is more informative than the absolute MA value.

Adaptive Moving Averages

def create_adaptive_ma(df: pd.DataFrame, fast_period: int = 2,
                       slow_period: int = 30, er_period: int = 10) -> pd.Series:
    """
    Kaufman's Adaptive Moving Average (KAMA).

    Adapts to market volatility - fast in trends, slow in ranges.

    Args:
        df: frame with a 'close' column.
        fast_period / slow_period: EMA periods bounding the smoothing constant.
        er_period: lookback for the efficiency ratio (new parameter; defaults
            to the previously hard-coded value of 10, so callers are unaffected).

    Returns:
        KAMA series aligned with df.index.

    Bug fix: the efficiency ratio is NaN for the first `er_period` bars.
    The old recursion fed that NaN into every subsequent value, so the
    whole series except element 0 came out NaN. During the warm-up we now
    track the raw close, and the recursion starts at the first valid
    smoothing constant.
    """
    close = df['close']

    # Efficiency ratio: net move over er_period vs total path length.
    change = (close - close.shift(er_period)).abs()
    volatility = close.diff().abs().rolling(er_period).sum()
    efficiency_ratio = change / volatility

    # Scaled smoothing constant, squared per Kaufman's formulation.
    fastest = 2 / (fast_period + 1)
    slowest = 2 / (slow_period + 1)
    smooth = (efficiency_ratio * (fastest - slowest) + slowest) ** 2

    kama = pd.Series(index=df.index, dtype=float)
    kama.iloc[0] = close.iloc[0]

    for i in range(1, len(df)):
        sc = smooth.iloc[i]
        if np.isnan(sc):
            # Warm-up: no efficiency ratio yet - follow price directly so
            # the recursion is seeded with the last observed price.
            kama.iloc[i] = close.iloc[i]
        else:
            kama.iloc[i] = kama.iloc[i - 1] + sc * (close.iloc[i] - kama.iloc[i - 1])

    return kama

# Attach KAMA and the price/KAMA ratio (a mean-reversion style signal).
# NOTE: assumes `df` with a 'close' column exists in the surrounding scope.
df['kama'] = create_adaptive_ma(df)
df['price_to_kama'] = df['close'] / df['kama']

3. Volatility Features

Rolling Volatility

def create_volatility_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Comprehensive volatility features.

    Expects 'return_1', 'open', 'high', 'low', 'close' columns; returns a
    copy with the new columns. Window sizes are in hours.

    Bug fix: 'vol_of_vol' previously read df['volatility_24'], a column
    this function never creates (the loop only builds 7/14/30/60), which
    raised KeyError. It is now derived from the 24h volatility series
    computed for the regime feature.
    """
    df = df.copy()

    # 1. Classic volatility: rolling std of hourly returns.
    for window in [7, 14, 30, 60]:
        df[f'volatility_{window}'] = df['return_1'].rolling(window).std()

    # 2. Parkinson volatility: high/low range based, more sample-efficient.
    df['parkinson_vol'] = np.sqrt(
        (np.log(df['high'] / df['low']) ** 2).rolling(24).mean() / (4 * np.log(2))
    )

    # 3. Garman-Klass volatility: uses all four OHLC prices.
    df['gk_vol'] = np.sqrt(
        0.5 * (np.log(df['high'] / df['low']) ** 2).rolling(24).mean() -
        (2 * np.log(2) - 1) * (np.log(df['close'] / df['open']) ** 2).rolling(24).mean()
    )

    # 4. Volatility regime (0=low, 1=medium, 2=high).
    # NOTE(review): qcut computes bin edges from the FULL series, so early
    # rows are labelled with knowledge of later volatility - mild lookahead;
    # fit the bins on the training window only for live use.
    vol_24h = df['return_1'].rolling(24).std()
    df['vol_regime'] = pd.qcut(
        vol_24h,
        q=3,
        labels=[0, 1, 2],
        duplicates='drop'
    ).astype(float)

    # 5. Volatility of volatility (second-order), from the same 24h series.
    df['vol_of_vol'] = vol_24h.rolling(24).std()

    # 6. Realized vs Implied volatility spread (if you have options data)
    # df['vol_spread'] = df['implied_volatility'] - df['realized_volatility']

    return df

Bollinger Bands

def create_bollinger_features(df: pd.DataFrame, window: int = 20,
                              num_std: float = 2) -> pd.DataFrame:
    """Add Bollinger Band levels plus width, %B, touch, and squeeze flags.

    Expects a 'close' column; returns a copy with the new columns.
    """
    out = df.copy()
    close = out['close']

    # Middle band is an SMA; outer bands sit num_std deviations away.
    middle = close.rolling(window).mean()
    offset = close.rolling(window).std() * num_std
    out['bb_middle'] = middle
    out['bb_upper'] = middle + offset
    out['bb_lower'] = middle - offset

    # Width normalized by the middle band, and %B (position inside bands).
    out['bb_width'] = (out['bb_upper'] - out['bb_lower']) / middle
    out['bb_position'] = (close - out['bb_lower']) / (out['bb_upper'] - out['bb_lower'])

    # Touching a band is a classic overbought/oversold signal.
    out['bb_upper_touch'] = (close >= out['bb_upper']).astype(int)
    out['bb_lower_touch'] = (close <= out['bb_lower']).astype(int)

    # Squeeze: width in the bottom quintile of its recent range - low
    # volatility that often precedes a breakout.
    width_floor = out['bb_width'].rolling(50).quantile(0.2)
    out['bb_squeeze'] = (out['bb_width'] < width_floor).astype(int)

    return out

4. Momentum Indicators

RSI (Relative Strength Index)

def create_rsi_features(df: pd.DataFrame, periods: tuple = (14, 28)) -> pd.DataFrame:
    """
    RSI and derived features.

    Args:
        df: frame with a 'close' column.
        periods: RSI lookback periods. The default was previously a mutable
            list ([14, 28]) - a classic Python pitfall - and is now an
            equivalent tuple; callers passing lists still work.

    Returns a copy of df with rsi_<p> plus normalized / oversold /
    overbought / neutral flags per period. Divergence and momentum features
    are added only when period 14 is requested (previously any `periods`
    without 14 raised KeyError on df['rsi_14']).

    Note: this is Cutler's RSI (simple moving averages of gains/losses),
    not Wilder's smoothed variant.
    """
    df = df.copy()

    for period in periods:
        # Average gain / average loss over the window; RS is their ratio.
        delta = df['close'].diff()
        gain = (delta.where(delta > 0, 0)).rolling(period).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(period).mean()
        rs = gain / loss
        df[f'rsi_{period}'] = 100 - (100 / (1 + rs))

        # Derived flags on the usual 30/70 thresholds and 40-60 neutral zone.
        df[f'rsi_{period}_normalized'] = df[f'rsi_{period}'] / 100
        df[f'rsi_{period}_oversold'] = (df[f'rsi_{period}'] < 30).astype(int)
        df[f'rsi_{period}_overbought'] = (df[f'rsi_{period}'] > 70).astype(int)
        df[f'rsi_{period}_neutral'] = ((df[f'rsi_{period}'] >= 40) &
                                       (df[f'rsi_{period}'] <= 60)).astype(int)

    if 14 in periods:
        # Divergence: price momentum minus (scaled) RSI momentum.
        price_slope = df['close'].pct_change(14)
        rsi_slope = df['rsi_14'].pct_change(14)
        df['rsi_divergence'] = price_slope - (rsi_slope / 100)

        # Short-horizon RSI momentum.
        df['rsi_momentum'] = df['rsi_14'].diff(3)

    return df

MACD (Moving Average Convergence Divergence)

def create_macd_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add MACD line, signal, histogram, crossover, and momentum features.

    Uses the standard 12/26/9 EMA parameters on the 'close' column and
    returns a copy of *df* with the new columns.
    """
    out = df.copy()
    close = out['close']

    # MACD line = fast EMA minus slow EMA; signal = EMA of the MACD line.
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    macd = fast_ema - slow_ema
    signal = macd.ewm(span=9, adjust=False).mean()
    out['macd'] = macd
    out['macd_signal'] = signal
    out['macd_hist'] = macd - signal

    # Histogram slope plus the exact bar where MACD crosses its signal.
    out['macd_hist_slope'] = out['macd_hist'].diff()
    prev_at_or_below = macd.shift(1) <= signal.shift(1)
    prev_at_or_above = macd.shift(1) >= signal.shift(1)
    out['macd_crossover'] = ((macd > signal) & prev_at_or_below).astype(int)
    out['macd_crossunder'] = ((macd < signal) & prev_at_or_above).astype(int)

    # First and second differences of the MACD line itself.
    out['macd_momentum'] = macd.diff(3)
    out['macd_acceleration'] = out['macd_momentum'].diff()

    # Which side of the zero line MACD sits on (trend direction proxy).
    out['macd_above_zero'] = (macd > 0).astype(int)

    return out

5. Volume Features

def create_volume_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add volume-based confirmation features (ratios, VWAP, OBV, spikes).

    No-op (returns a copy unchanged) when there is no 'volume' column.
    """
    out = df.copy()

    if 'volume' not in out.columns:
        return out

    vol = out['volume']

    # 1. Rolling volume baselines.
    out['volume_ma_7'] = vol.rolling(7).mean()
    out['volume_ma_30'] = vol.rolling(30).mean()

    # 2. Relative volume: >1 means busier than usual.
    out['volume_ratio_7'] = vol / out['volume_ma_7']
    out['volume_ratio_30'] = vol / out['volume_ma_30']

    # 3. First and second differences of volume.
    out['volume_change'] = vol.pct_change()
    out['volume_acceleration'] = out['volume_change'].diff()

    # 4. 24h volume-weighted average price and price's distance from it.
    turnover = (out['close'] * vol).rolling(24).sum()
    out['vwap_daily'] = turnover / vol.rolling(24).sum()
    out['price_to_vwap'] = out['close'] / out['vwap_daily']

    # 5. On-Balance Volume: signed cumulative volume flow.
    out['obv'] = (np.sign(out['close'].diff()) * vol).fillna(0).cumsum()
    out['obv_ma'] = out['obv'].rolling(20).mean()

    # 6. Linear-regression slope of volume over the last 20 bars.
    out['volume_trend'] = vol.rolling(20).apply(
        lambda w: np.polyfit(np.arange(len(w)), w, 1)[0]
    )

    # 7. Do price and volume move together?
    out['price_volume_corr'] = out['close'].rolling(20).corr(vol)

    # 8. Spike flag: volume more than 2 sigma above its 50-bar mean.
    zscore = (vol - vol.rolling(50).mean()) / vol.rolling(50).std()
    out['volume_spike'] = (zscore > 2).astype(int)

    return out

6. Temporal Features

def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add calendar/seasonality features from a DatetimeIndex.

    No-op (returns a copy unchanged) when the index is not datetime-like.
    Session-hour flags are in UTC; crypto trades 24/7 but activity still
    clusters by region.
    """
    out = df.copy()

    if not isinstance(out.index, pd.DatetimeIndex):
        return out

    idx = out.index

    # Raw calendar components (day_of_week: 0=Monday ... 6=Sunday).
    out['hour'] = idx.hour
    out['day_of_week'] = idx.dayofweek
    out['day_of_month'] = idx.day
    out['month'] = idx.month
    out['quarter'] = idx.quarter

    # Sine/cosine encoding keeps hour 23 adjacent to hour 0.
    hour_angle = 2 * np.pi * out['hour'] / 24
    day_angle = 2 * np.pi * out['day_of_week'] / 7
    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['day_sin'] = np.sin(day_angle)
    out['day_cos'] = np.cos(day_angle)

    # Saturday/Sunday flag.
    out['is_weekend'] = (out['day_of_week'] >= 5).astype(int)

    # Rough regional trading sessions (UTC, inclusive, overlapping).
    out['is_us_hours'] = ((out['hour'] >= 13) & (out['hour'] <= 21)).astype(int)
    out['is_asia_hours'] = ((out['hour'] >= 0) & (out['hour'] <= 8)).astype(int)
    out['is_europe_hours'] = ((out['hour'] >= 7) & (out['hour'] <= 15)).astype(int)

    return out

7. Lag Features

def create_lag_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Historical price lags - the model 'remembers' past values.

    Adds lagged close prices (plus difference/ratio vs each lag), lagged
    returns when 'return_1' exists, lagged volume when 'volume' exists,
    and lagged RSI when 'rsi' exists.

    Bug fix: 'return_1' is now optional. Previously this function assumed
    the returns step had already run and raised KeyError on a frame with
    only price data, even though the volume and rsi sections were guarded.
    """
    df = df.copy()

    has_returns = 'return_1' in df.columns

    # Lag horizons in hours: 1h, 2h, 3h, 6h, 12h, 1d, 2d, 1w.
    lags = [1, 2, 3, 6, 12, 24, 48, 168]
    for lag in lags:
        lagged_close = df['close'].shift(lag)
        df[f'close_lag_{lag}'] = lagged_close
        if has_returns:
            df[f'return_lag_{lag}'] = df['return_1'].shift(lag)

        # Distance from the lagged price (mean reversion signal).
        df[f'close_diff_lag_{lag}'] = df['close'] - lagged_close
        df[f'close_ratio_lag_{lag}'] = df['close'] / lagged_close

    # Volume lags (optional column).
    if 'volume' in df.columns:
        for lag in [1, 3, 24]:
            df[f'volume_lag_{lag}'] = df['volume'].shift(lag)

    # Indicator lags (optional column).
    if 'rsi' in df.columns:
        df['rsi_lag_1'] = df['rsi'].shift(1)
        df['rsi_change'] = df['rsi'] - df['rsi_lag_1']

    return df

8. Advanced Pattern Features

Price Patterns

def create_pattern_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Candlestick patterns and fractals.

    Expects 'open', 'high', 'low', 'close' columns; returns a copy with
    the pattern columns added.

    Bug fix (lookahead): the fractal features used shift(-1), i.e. the
    NEXT bar's high/low, leaking future data into training - exactly what
    this guide warns against. They now flag a local extreme one bar after
    it forms, once it is confirmed: is_local_max is 1 on bar t when bar
    t-1's high exceeded both neighbours (t-2 and t).
    """
    df = df.copy()

    # 1. Candle anatomy: body, shadows, total range.
    df['candle_body'] = abs(df['close'] - df['open'])
    df['candle_upper_shadow'] = df['high'] - df[['close', 'open']].max(axis=1)
    df['candle_lower_shadow'] = df[['close', 'open']].min(axis=1) - df['low']
    df['candle_total_range'] = df['high'] - df['low']

    # 2. Body relative to range (NaN when high == low, since 0/0 -> NaN).
    df['body_to_range'] = df['candle_body'] / df['candle_total_range']

    # 3. Doji: tiny body, market indecision.
    df['is_doji'] = (df['body_to_range'] < 0.1).astype(int)

    # 4. Hammer (long lower shadow, bullish close) / shooting star.
    df['is_hammer'] = ((df['candle_lower_shadow'] > 2 * df['candle_body']) &
                       (df['close'] > df['open'])).astype(int)
    df['is_shooting_star'] = ((df['candle_upper_shadow'] > 2 * df['candle_body']) &
                              (df['close'] < df['open'])).astype(int)

    # 5. Bullish engulfing: green body swallowing the prior red body.
    df['bullish_engulfing'] = ((df['close'] > df['open']) &
                               (df['close'].shift(1) < df['open'].shift(1)) &
                               (df['open'] < df['close'].shift(1)) &
                               (df['close'] > df['open'].shift(1))).astype(int)

    # 6. Confirmed fractals, lagged one bar so no future data is used.
    prev_high = df['high'].shift(1)
    prev_low = df['low'].shift(1)
    df['is_local_max'] = ((prev_high > df['high'].shift(2)) &
                          (prev_high > df['high'])).astype(int)
    df['is_local_min'] = ((prev_low < df['low'].shift(2)) &
                          (prev_low < df['low'])).astype(int)

    # 7. Proximity to the trailing 24h high/low (support/resistance).
    df['distance_to_24h_high'] = (df['high'].rolling(24).max() - df['close']) / df['close']
    df['distance_to_24h_low'] = (df['close'] - df['low'].rolling(24).min()) / df['close']

    return df

Trend Features

def create_trend_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add trend direction/strength features: regression slopes, ADX, SAR.

    Expects 'high', 'low', 'close' columns; returns a copy with the new
    columns. 'sar' is a simplified EMA-based stand-in for a true Parabolic
    SAR, kept for the price_above_sar flag.
    """
    out = df.copy()

    # 1. Slope of a least-squares line fit over each rolling window.
    for win in (7, 14, 30):
        out[f'trend_slope_{win}'] = out['close'].rolling(win).apply(
            lambda w: np.polyfit(np.arange(len(w)), w, 1)[0] if len(w) == win else np.nan
        )

    # 2. ADX: directional movement relative to the true range.
    up_move = out['high'].diff()
    down_move = -out['low'].diff()

    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0)

    # True range: widest of the three candidate ranges.
    true_range = pd.concat([
        out['high'] - out['low'],
        abs(out['high'] - out['close'].shift()),
        abs(out['low'] - out['close'].shift())
    ], axis=1).max(axis=1)

    avg_tr = true_range.rolling(14).mean()
    plus_di = 100 * (plus_dm.rolling(14).mean() / avg_tr)
    minus_di = 100 * (minus_dm.rolling(14).mean() / avg_tr)

    # DX first, then smoothed into ADX.
    dx = 100 * abs(plus_di - minus_di) / (plus_di + minus_di)
    out['adx'] = dx.rolling(14).mean()

    # 3. Simplified SAR proxy and which side of it price sits on.
    out['sar'] = out['close'].ewm(span=20).mean()
    out['price_above_sar'] = (out['close'] > out['sar']).astype(int)

    return out

9. Feature Selection

Importance-Based Selection

from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb

def select_best_features(X, y, k=30):
    """
    Select the top *k* features by univariate F-test score.

    Note: the previous docstring said "mutual information", but the score
    function is f_regression (a linear F-statistic). Swap in
    sklearn.feature_selection.mutual_info_regression if nonlinear
    dependence matters.

    Args:
        X: feature DataFrame.
        y: target vector aligned with X.
        k: number of features to keep.

    Returns:
        (X restricted to the selected columns, list of selected names).
    """
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    # Rank by score; NaN scores (e.g. constant columns) explicitly sort last.
    scores = pd.DataFrame({
        'feature': X.columns,
        'score': selector.scores_
    }).sort_values('score', ascending=False, na_position='last')

    selected_features = scores.head(k)['feature'].tolist()
    return X[selected_features], selected_features

def get_xgboost_feature_importance(model, feature_names):
    """Return a (feature, importance) DataFrame sorted by importance desc.

    Works with any fitted estimator exposing `feature_importances_`
    (XGBoost, sklearn tree ensembles, ...).
    """
    ranked = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    return ranked.sort_values('importance', ascending=False)

Correlation-Based Filtering

def remove_correlated_features(X, threshold=0.95):
    """Drop one of every pair of features whose |correlation| > threshold.

    Only the upper triangle of the correlation matrix is scanned, so for
    each correlated pair the later column (in X's order) is the one dropped.

    Returns:
        (filtered DataFrame, list of dropped column names).
    """
    abs_corr = X.corr().abs()

    # Mask the diagonal and lower triangle to avoid self-matches and
    # double-counting each pair.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper_tri = abs_corr.where(upper_mask)

    to_drop = [col for col in upper_tri.columns if (upper_tri[col] > threshold).any()]

    return X.drop(columns=to_drop), to_drop

10. Feature Scaling

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

def scale_features(X_train, X_test, method='minmax'):
    """
    Scale features with a scaler fit on the training split only.

    (Tree-based models like XGBoost are largely scale-invariant, so this
    mainly matters for linear/NN models.)

    Args:
        X_train / X_test: feature matrices. The scaler is fit on X_train
            and only applied to X_test, avoiding test-set leakage.
        method: 'minmax' ([0, 1]), 'standard' (zero mean, unit variance),
            or 'robust' (median/IQR, outlier-resistant).

    Returns:
        (scaled X_train, scaled X_test, fitted scaler).

    Raises:
        ValueError: for an unknown *method*. (Previously an unknown method
        fell through and crashed with UnboundLocalError on `scaler`.)
    """
    if method == 'minmax':
        scaler = MinMaxScaler()  # Scales to [0, 1]
    elif method == 'standard':
        scaler = StandardScaler()  # Zero mean, unit variance
    elif method == 'robust':
        scaler = RobustScaler()  # Uses median and IQR, robust to outliers
    else:
        raise ValueError(f"Unknown scaling method: {method!r}")

    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled, scaler

Best Practices

Avoid Data Leakage: Never use future information in features. Example: Don’t use df['close'].shift(-1) as a feature!
  1. Start simple: Begin with basic returns, MAs, and momentum
  2. Add complexity: Gradually add advanced features
  3. Test incremental: Evaluate each feature’s impact
  4. Remove redundant: Drop highly correlated features
  5. Monitor performance: Track feature importance over time
  • Rolling windows create NaN at the start of the series
  • Use .ffill() (forward fill) or .dropna() after feature creation — avoid backward fill (.bfill()), which copies future values into earlier rows and leaks data; note that .fillna(method=...) is deprecated in modern pandas
  • Never use .fillna(0) unless 0 is meaningful for that feature
  • Document which features cause NaN and why
  • Never shuffle: Maintain temporal order
  • Use lags carefully: Too many lags = overfitting
  • Stationarity: Crypto returns are more stationary than prices
  • Regime changes: Markets change, retrain regularly
  • Crypto-specific: Consider exchange hours, whale movements
  • Technical analysis: RSI, MACD are proven indicators
  • Market microstructure: Volume, bid-ask spread matter
  • Sentiment: Twitter/Reddit sentiment can be predictive

Complete Feature Engineering Pipeline

import pandas as pd
import numpy as np

class CryptoFeatureEngineer:
    """End-to-end feature engineering pipeline for crypto price prediction.

    Chains every feature builder in this module, attaches the prediction
    target, and prepares a clean (X, y) pair for model training.
    """

    def __init__(self):
        # Fitted artifacts, populated by later pipeline stages.
        self.scaler = None
        self.selected_features = None

    def create_all_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run every feature builder in order and attach the target.

        The target is the next period's close. Rows made incomplete by
        rolling-window warm-up (or the final row, whose shifted target is
        NaN) are dropped at the end.
        """
        df = df.copy()

        # Ordered pipeline - returns must run first, since later steps
        # (volatility, lags) consume the 'return_1' column.
        feature_steps = (
            create_return_features,          # 1. returns
            create_moving_average_features,  # 2. moving averages
            create_volatility_features,      # 3a. volatility
            create_bollinger_features,       # 3b. Bollinger bands
            create_rsi_features,             # 4a. RSI
            create_macd_features,            # 4b. MACD
            create_volume_features,          # 5. volume
            create_temporal_features,        # 6. calendar/seasonality
            create_lag_features,             # 7. lags
            create_pattern_features,         # 8a. candlestick patterns
            create_trend_features,           # 8b. trend strength
        )
        for step in feature_steps:
            df = step(df)

        # Supervised target: next period's closing price.
        df['target'] = df['close'].shift(-1)

        # Drop warm-up rows and the target-less last row.
        df = df.dropna()

        return df

    def prepare_for_training(self, df: pd.DataFrame, target_col: str = 'target'):
        """Split features from the target and prune redundant columns."""
        # Raw OHLCV columns are excluded; only derived features remain.
        exclude_cols = [target_col, 'open', 'high', 'low', 'close', 'volume']
        feature_cols = [col for col in df.columns if col not in exclude_cols]

        X = df[feature_cols]
        y = df[target_col]

        # Prune near-duplicate features (|corr| > 0.95).
        X, dropped = remove_correlated_features(X, threshold=0.95)
        print(f"Dropped {len(dropped)} correlated features")

        # Optional univariate selection step:
        # X, self.selected_features = select_best_features(X, y, k=50)

        return X, y

# Usage
# End-to-end example: build all features from a raw OHLCV frame `df`
# (defined elsewhere), then split into a model-ready (X, y) pair.
engineer = CryptoFeatureEngineer()
df_with_features = engineer.create_all_features(df)
X, y = engineer.prepare_for_training(df_with_features)

# X columns are derived features only; raw OHLCV columns are excluded.
print(f"Created {X.shape[1]} features from raw OHLCV data")

Testing Feature Quality

def evaluate_feature_predictive_power(X, y, feature_name):
    """
    Evaluate a single feature's predictive power via univariate R².

    Fits a one-variable linear regression of y on the feature and returns
    the in-sample R² - so this measures linear association, not
    out-of-sample skill.

    Bug fix: rows are now dropped when EITHER the feature or the target is
    NaN; previously a NaN in y slipped past the feature-only mask and
    crashed LinearRegression.fit.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score

    X_feature = X[feature_name].values.reshape(-1, 1)
    y_values = np.asarray(y, dtype=float)

    # Keep only rows where both the feature and the target are present.
    mask = ~np.isnan(X_feature.flatten()) & ~np.isnan(y_values)
    X_clean = X_feature[mask]
    y_clean = y_values[mask]

    # Fit simple linear regression and score it on the same data.
    model = LinearRegression()
    model.fit(X_clean, y_clean)
    y_pred = model.predict(X_clean)

    return r2_score(y_clean, y_pred)

# Test all features
# Screen every engineered feature and report those with a univariate
# linear R² above 0.1 (assumes `X` and `y` exist in the surrounding scope).
for feature in X.columns:
    r2 = evaluate_feature_predictive_power(X, y, feature)
    if r2 > 0.1:  # Only show features with R² > 0.1
        print(f"{feature}: R² = {r2:.4f}")

Next Steps

Custom Models

Use these features in custom prediction models

Backtesting

Test your features with historical data

XGBoost Reference

See the full XGBoost model implementation

Production Tips

Deploy feature engineering pipelines

Build docs developers (and LLMs) love