Working with the EDL Pipeline Output
The EDL Pipeline produces all_stocks_fundamental_analysis.json.gz, a compressed JSON file containing 86 fields for ~2,775 NSE stocks. This guide shows how to load and work with this data.

Output File Structure

Primary Output

File: all_stocks_fundamental_analysis.json.gz
Size: ~2 MB (compressed), ~50 MB (uncompressed)
Format: Gzip-compressed JSON array
Records: ~2,775 stocks × 86 fields each

Secondary Outputs

  • sector_analytics.json.gz - Sector-level performance metrics
  • market_breadth.json.gz - Daily market breadth indicators
  • ohlcv_data/*.csv - Historical OHLCV data per stock

Loading the Output

Python (Standard Library)

import gzip
import json

# Decompress and parse the gzip-compressed JSON array into a list of dicts.
with gzip.open('all_stocks_fundamental_analysis.json.gz', mode='rt', encoding='utf-8') as fh:
    stocks = json.load(fh)

print(f"Loaded {len(stocks)} stocks")

# Peek at the first record to see the field layout.
first = stocks[0]
print(f"{first['Symbol']}: ₹{first['Stock Price(₹)']}")
print(f"Market Cap: ₹{first['Market Cap(Cr.)']} Cr.")
print(f"P/E: {first['P/E']}")

pandas DataFrame

import pandas as pd
import gzip
import json

# Decode the compressed JSON, then hand the record list to pandas.
with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as handle:
    records = json.load(handle)

df = pd.DataFrame(records)

# Summarise the frame's structure and preview the key columns.
print(df.info())
print(df[['Symbol', 'Stock Price(₹)', 'Market Cap(Cr.)', 'P/E']].head(10))

Command Line (jq)

# View first stock
zcat all_stocks_fundamental_analysis.json.gz | jq '.[0]'

# Filter IT sector stocks
zcat all_stocks_fundamental_analysis.json.gz | \
  jq '.[] | select(.Sector == "IT")'

# Get top 10 by market cap
# (field names with special characters must be quoted as ."Market Cap(Cr.)")
# NOTE: on macOS/BSD, zcat expects a .Z suffix — use `gzcat` or `gzip -dc` there.
zcat all_stocks_fundamental_analysis.json.gz | \
  jq 'sort_by(-."Market Cap(Cr.)") | .[0:10] | .[] | {Symbol, "Market Cap": ."Market Cap(Cr.)"}'

Common Use Cases

1. Screening Stocks by Criteria

import gzip
import json


def screen_stocks(stocks):
    """Return stocks with 0 < P/E < 20, ROE > 15% and market cap > 1000 Cr.

    Fields may be absent or JSON null in the source data, so every value
    is coalesced to 0 before comparison — the original expression
    ``s.get('P/E', 0) > 0`` raised TypeError whenever the field was null,
    because ``get`` only substitutes the default for a *missing* key.
    """
    matched = []
    for s in stocks:
        pe = s.get('P/E') or 0
        roe = s.get('ROE(%)') or 0
        mcap = s.get('Market Cap(Cr.)') or 0
        if 0 < pe < 20 and roe > 15 and mcap > 1000:
            matched.append(s)
    return matched


if __name__ == '__main__':
    with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
        stocks = json.load(f)

    # Screen for: P/E < 20, ROE > 15%, Market Cap > 1000 Cr.
    for stock in screen_stocks(stocks)[:10]:
        print(f"{stock['Symbol']:15} P/E: {stock['P/E']:6.2f} ROE: {stock['ROE(%)']}% MCap: {stock['Market Cap(Cr.)']}")

2. Sector Analysis

import pandas as pd
import gzip
import json

# Load the full dataset straight into a DataFrame.
with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as source:
    df = pd.DataFrame(json.load(source))

by_sector = df.groupby('Sector')

# P/E distribution per sector: mean, median and number of stocks.
pe_stats = by_sector['P/E'].agg(['mean', 'median', 'count'])
print(pe_stats.sort_values('mean', ascending=False))

# Sectors ranked by their average market capitalisation.
avg_mcap = by_sector['Market Cap(Cr.)'].mean()
print(avg_mcap.sort_values(ascending=False).head(10))

3. Event-Based Screening

import gzip
import json

with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
    stocks = json.load(f)

# The 📊 marker in the "Event Markers" field flags a recent results event.
recent_results = [s for s in stocks if '📊' in s.get('Event Markers', '')]

print(f"Found {len(recent_results)} stocks with recent results:")
for stock in recent_results[:20]:
    print(f"{stock['Symbol']:15} Returns since Earnings: {stock.get('Returns since Earnings(%)', 0)}%")

4. Technical Analysis Filtering

import gzip
import json


def find_strong_momentum(stocks):
    """Return stocks above their 20/50/200-day SMAs with RSI > 50 and a
    1-month return above 5%.

    Numeric fields can be absent or JSON null, so they are coalesced to 0
    before comparison — the original ``s.get('RSI (14)', 0) > 50`` raised
    TypeError on null values (``get`` only defaults a *missing* key).
    """
    matches = []
    for s in stocks:
        sma_status = s.get('SMA Status', '')
        above_all = ('SMA 20: Above' in sma_status
                     and 'SMA 50: Above' in sma_status
                     and 'SMA 200: Above' in sma_status)
        rsi = s.get('RSI (14)') or 0
        one_month = s.get('1 Month Returns(%)') or 0
        if above_all and rsi > 50 and one_month > 5:
            matches.append(s)
    return matches


if __name__ == '__main__':
    with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
        stocks = json.load(f)

    strong_momentum = find_strong_momentum(stocks)
    print(f"Found {len(strong_momentum)} stocks in strong uptrend:")
    for stock in strong_momentum[:15]:
        print(f"{stock['Symbol']:15} 1M: {stock['1 Month Returns(%)']:6.2f}% RSI: {stock['RSI (14)']}")

Working with OHLCV Data

Load Individual Stock OHLCV

import pandas as pd
import os

OHLCV_DIR = 'ohlcv_data'

# Read one stock's history, parsing and indexing by trading date in one call.
df = pd.read_csv(os.path.join(OHLCV_DIR, 'RELIANCE.csv'),
                 parse_dates=['Date'], index_col='Date')

print(df.tail())
print(f"\nTotal trading days: {len(df)}")
print(f"Date range: {df.index.min()} to {df.index.max()}")

Calculate Custom Indicators

import pandas as pd

# Price history indexed by trading date.
df = pd.read_csv('ohlcv_data/RELIANCE.csv')
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

close = df['Close']

# 20-day simple moving average of the closing price.
df['SMA_20'] = close.rolling(20).mean()

# Day-over-day percentage change of the close.
df['Daily_Return'] = close.pct_change() * 100

# Realised volatility: rolling 20-day standard deviation of daily returns.
df['Volatility'] = df['Daily_Return'].rolling(20).std()

print(df[['Close', 'SMA_20', 'Daily_Return', 'Volatility']].tail(10))

Exporting to Different Formats

Excel

import pandas as pd
import gzip
import json

with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as fh:
    df = pd.DataFrame(json.load(fh))

# One workbook, three sheets: full universe, largest 100, per-sector rollup.
with pd.ExcelWriter('stock_analysis.xlsx', engine='openpyxl') as writer:
    df.to_excel(writer, sheet_name='All Stocks', index=False)

    top_100 = df.nlargest(100, 'Market Cap(Cr.)')
    top_100.to_excel(writer, sheet_name='Top 100', index=False)

    rollup = df.groupby('Sector').agg(
        {'Symbol': 'count', 'Market Cap(Cr.)': 'sum', 'P/E': 'mean'}
    )
    rollup.to_excel(writer, sheet_name='Sector Summary')

CSV

import pandas as pd
import gzip
import json

with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as fh:
    df = pd.DataFrame(json.load(fh))

# Full dataset as a plain CSV.
df.to_csv('all_stocks.csv', index=False)

# Filtered subset: only stocks whose return on equity exceeds 20%.
df.loc[df['ROE(%)'] > 20].to_csv('high_roe_stocks.csv', index=False)

SQLite Database

import sqlite3
from contextlib import closing
import pandas as pd
import gzip
import json

with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
    df = pd.DataFrame(json.load(f))

# Create the SQLite database. closing() guarantees the connection is
# released even if to_sql or the query raises — the original leaked the
# connection on any error before conn.close().
with closing(sqlite3.connect('stocks.db')) as conn:
    df.to_sql('stocks', conn, if_exists='replace', index=False)

    # Query the database (column names with special characters are quoted).
    result = pd.read_sql_query("""
    SELECT Symbol, "Stock Price(₹)", "Market Cap(Cr.)", "P/E"
    FROM stocks
    WHERE Sector = 'IT' AND "P/E" < 30
    ORDER BY "Market Cap(Cr.)" DESC
    LIMIT 10
""", conn)

    print(result)

Performance Optimization

Stream Processing for Large Files

For extremely large datasets, stream processing avoids loading the entire file into memory:
import gzip
import json

# Stream the array one record at a time instead of json.load()-ing it all.
#
# NOTE(review): this relies on the writer emitting exactly one JSON object
# per line inside the array ("[", one record per line, "]"). It will break
# on compact single-line or arbitrarily pretty-printed JSON — confirm the
# pipeline's output format before relying on it.
with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
    f.read(1)  # consume the opening '[' of the array

    high_pe_count = 0
    for line in f:
        stripped = line.strip()
        # Skip the closing bracket and separator- or whitespace-only lines.
        if stripped in (']', ',', ''):
            continue
        # Strip whitespace BEFORE removing the trailing comma. The original
        # line.rstrip(',') never removed the comma because the newline was
        # the last character, so json.loads raised "Extra data" on every
        # record line.
        stock = json.loads(stripped.rstrip(','))
        # Coalesce null P/E values to 0 (None > 50 would raise TypeError).
        if (stock.get('P/E') or 0) > 50:
            high_pe_count += 1
            print(f"{stock['Symbol']}: P/E {stock['P/E']}")

    print(f"\nTotal high P/E stocks: {high_pe_count}")
The current output file (~2 MB compressed, ~50 MB uncompressed) easily fits in memory on most systems. Stream processing is only needed for significantly larger datasets.

Data Validation

Check for Missing Fields

import gzip
import json

# Fields every record must carry a usable value for.
REQUIRED_FIELDS = ['Symbol', 'Stock Price(₹)', 'Market Cap(Cr.)', 'P/E']


def find_missing_fields(stocks, required=None):
    """Return (symbol, field) pairs for records whose required fields are
    absent, None, 0 or the empty string.

    ``stock.get(field)`` covers both the missing-key and the bad-value
    case in a single lookup (a missing key yields None, which is itself
    a rejected value).
    """
    if required is None:
        required = REQUIRED_FIELDS
    problems = []
    for stock in stocks:
        symbol = stock.get('Symbol', 'UNKNOWN')
        for field in required:
            if stock.get(field) in (None, 0, ''):
                problems.append((symbol, field))
    return problems


if __name__ == '__main__':
    with gzip.open('all_stocks_fundamental_analysis.json.gz', 'rt') as f:
        stocks = json.load(f)

    missing_data = find_missing_fields(stocks)
    if missing_data:
        print(f"Found {len(missing_data)} missing field instances:")
        for symbol, field in missing_data[:20]:
            print(f"  {symbol}: {field}")
    else:
        print("All required fields present!")

Verify Data Freshness

import os
import datetime

file_path = 'all_stocks_fundamental_analysis.json.gz'

# How long ago was the output last written?
mod_datetime = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
age_hours = (datetime.datetime.now() - mod_datetime).total_seconds() / 3600

print(f"File last updated: {mod_datetime}")
print(f"Age: {age_hours:.1f} hours")

# Anything older than a day likely means the pipeline hasn't run today.
if age_hours <= 24:
    print("✅ Data is fresh.")
else:
    print("⚠️  Data is more than 24 hours old. Consider running pipeline update.")

Next Steps

Output Schema

Complete reference of all 86 fields

Custom Filters

Create advanced stock screening filters

Incremental Updates

Keep your data fresh with daily updates

Build docs developers (and LLMs) love