Platzi Viewer streams all course content from Google Drive using a service account. No files are stored locally—only metadata (file IDs, course structure) is cached in courses_cache.json.
def _get_session(self):
    """Return the process-wide AuthorizedSession, creating it on first use.

    A single shared session avoids cold-start latency on each per-request
    thread (connection pooling + reused TLS handshakes).

    Returns:
        AuthorizedSession: session authenticated with ``self.creds``.
    """
    # Double-checked locking: the unlocked fast path avoids lock contention
    # once the session exists; the second check inside the lock prevents two
    # threads from both constructing a session.
    if self._shared_session is None:
        with self._shared_session_lock:
            if self._shared_session is None:
                session = AuthorizedSession(self.creds)
                # Disable Drive API response compression — we need raw bytes
                # for video streaming, and compression would break Range
                # requests (see note in the surrounding docs).
                session.headers.update({"Accept-Encoding": "identity"})
                self._shared_session = session
    return self._shared_session
Accept-Encoding: identity is set to disable Drive API compression. We want raw bytes for video streaming—compression would break Range requests.
def download_file_range(self, file_id, start=None, end=None, range_header=None):
    """Stream a byte range of a Drive file via ``alt=media``.

    Args:
        file_id: Google Drive file ID (validated before use).
        start: optional first byte offset of the requested range.
        end: optional last byte offset of the requested range.
        range_header: raw ``Range`` header from the client; when given it
            takes precedence over ``start``/``end``.

    Returns:
        A streaming ``requests`` response (caller iterates the body).

    Raises:
        ValueError: if ``file_id`` is not a valid Drive ID.
        requests.HTTPError: on a non-2xx Drive API response.
    """
    file_id = self._validate_drive_id(file_id, "file_id")
    session = self._get_session()

    url = f"https://www.googleapis.com/drive/v3/files/{file_id}"
    params = {
        "alt": "media",
        "supportsAllDrives": "true",
    }

    headers = {}
    if range_header:
        # Forward the client's Range header as-is (whitespace trimmed).
        headers["Range"] = str(range_header).strip()
    else:
        # Build an RFC 7233 range; an empty side means "from start" / "to end".
        lo = "" if start is None else start
        hi = "" if end is None else end
        if lo != "" or hi != "":
            headers["Range"] = f"bytes={lo}-{hi}"

    # stream=True defers body download; (connect, read) timeouts in seconds.
    response = session.get(url, headers=headers, params=params, stream=True, timeout=(5, 60))
    response.raise_for_status()
    return response
The method accepts:
range_header: Raw Range header from client (e.g., bytes=0-1048575)
def main():
    """Rebuild courses_cache.json by matching PlatziRoutes.md courses to Drive.

    Pipeline: parse the markdown course catalog, list course folders at the
    Drive root, match each markdown course to a Drive folder, scan matched
    folders (resuming from a progress file), then write the merged cache.
    """
    print("📦 Rebuilding courses_cache.json from Google Drive")

    # 1. Parse PlatziRoutes.md
    parsed = parse_routes.parse()
    categories = parsed["categories"]

    # 2. List all course folders from Drive root
    root_items = list_drive_folder(DRIVE_ROOT_ID)
    drive_folders = [f for f in root_items if f["mimeType"] == "application/vnd.google-apps.folder"]

    # Build Drive lookup: sanitized name -> {name, id}
    drive_map = {}
    for df in drive_folders:
        san = sanitize_for_match(df["name"])
        drive_map[san] = {"name": df["name"], "id": df["id"]}
    # BUG FIX: `drive_san` was passed to match_course_to_drive() but never
    # defined. It is the set of sanitized Drive folder names used for the
    # matcher's exact-match fast path.
    drive_san = set(drive_map)

    # 3. Load previous scan progress (for resume)
    scanned = load_scan_progress()

    # 4. Match courses and scan Drive content
    courses_scanned_this_run = 0
    for cat in categories:
        for route in cat["routes"]:
            for course in route.get("courses", []):
                md_name = course["name"]
                drive_folder_name, drive_folder_id, match_type = match_course_to_drive(
                    md_name, drive_san, drive_map
                )
                if not drive_folder_name:
                    # No Drive folder matched this markdown course; skip it.
                    continue
                if drive_folder_id in scanned:
                    # Reuse the cached scan from a previous (possibly
                    # interrupted) run instead of re-hitting the Drive API.
                    entry = scanned[drive_folder_id]
                else:
                    # Scan Drive for modules/classes.
                    modules, has_pres, pres_id = scan_drive_course(drive_folder_id, drive_folder_name)
                    # NOTE(review): entry schema reconstructed from the scan
                    # return values — confirm against courses_cache.json.
                    entry = {
                        "name": drive_folder_name,
                        "match": match_type,
                        "modules": modules,
                        "hasPresentations": has_pres,
                        "presentationsId": pres_id,
                    }
                    scanned[drive_folder_id] = entry
                    courses_scanned_this_run += 1
                    # Checkpoint every 10 newly scanned courses so an
                    # interrupted run can resume.
                    if courses_scanned_this_run % 10 == 0:
                        save_scan_progress(scanned)
                        print(f"💾 Progress saved ({courses_scanned_this_run} scanned this run)")
                # Attach the Drive data to the course entry in the catalog.
                course["drive"] = {"id": drive_folder_id, **entry}

    # Final checkpoint so a completed run never loses progress.
    save_scan_progress(scanned)

    # 5. Save courses_cache.json
    result = {"categories": categories}
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
def match_course_to_drive(md_name, drive_names_san, drive_names_map):
    """Try to match an MD course name to a Drive folder.

    Args:
        md_name: course name as written in PlatziRoutes.md.
        drive_names_san: set of sanitized Drive folder names (fast membership).
        drive_names_map: sanitized name -> {"name": ..., "id": ...}.

    Returns:
        (folder_name, folder_id, match_type) where match_type is one of
        "exact", "prefix" or "fuzzy"; (None, None, None) when nothing matches.
    """
    san = sanitize_for_match(md_name)

    # 1. Exact match.
    # BUG FIX: the original indexed drive_names_map[san] after checking only
    # drive_names_san, raising KeyError if the set and the map ever disagree.
    # Use the map itself as the source of truth.
    if san in drive_names_san:
        info = drive_names_map.get(san)
        if info is not None:
            return info["name"], info["id"], "exact"

    # 2. MD name starts with Drive name (length guard avoids short false hits).
    for ds, info in drive_names_map.items():
        if san.startswith(ds) and len(ds) > 20:
            return info["name"], info["id"], "prefix"

    # 3. Drive name starts with MD name. Kept the "prefix" label for
    # backward compatibility with existing cache entries, even though the
    # direction is reversed relative to rule 2.
    for ds, info in drive_names_map.items():
        if ds.startswith(san) and len(san) > 20:
            return info["name"], info["id"], "prefix"

    # 4. High word overlap: >= 80% of the shorter name's words must appear in
    # the other name, and at least 4 words must overlap overall.
    san_words = set(san.split())
    best_match = None
    best_overlap = 0
    for ds, info in drive_names_map.items():
        ds_words = set(ds.split())
        overlap = len(san_words & ds_words)
        min_len = min(len(san_words), len(ds_words))
        if min_len > 0 and overlap / min_len >= 0.8 and overlap > best_overlap:
            best_overlap = overlap
            best_match = (info["name"], info["id"], "fuzzy")
    if best_match and best_overlap >= 4:
        return best_match

    return None, None, None
The matcher uses fuzzy logic to handle name variations:
“React Course 2024” matches “React Course”
“Curso de JavaScript Moderno” matches “JavaScript Moderno”
def scan_drive_classes(folder_id):
    """Scan class files inside a Drive module folder.

    Files are expected to be named ``"<num>. <title>.<ext>"``; files sharing
    the same leading number belong to the same class.

    Args:
        folder_id: Drive folder ID of the module.

    Returns:
        List of class dicts sorted by class number, each with name, flags,
        known file IDs and leftover resources.
    """
    files = list_drive_folder(folder_id)

    # Group files by leading class number ("12. Title.mp4" -> 12).
    class_files = {}
    for f in files:
        parts = f["name"].split(". ", 1)
        if len(parts) >= 2 and parts[0].isdigit():
            class_files.setdefault(int(parts[0]), []).append(f)

    classes = []
    for num in sorted(class_files):
        flist = class_files[num]
        video = summary = vtt = None
        video_id = summary_id = vtt_id = None
        resources = []
        for f in flist:
            fname = f["name"]
            fid = f["id"]
            if fname.endswith(".mp4"):
                video, video_id = fname, fid
            elif fname.endswith("_summary.html"):
                summary, summary_id = fname, fid
            elif fname.endswith(".vtt"):
                vtt, vtt_id = fname, fid
            else:
                # NOTE(review): original elided "other file types" here —
                # anything unrecognized is exposed as a downloadable resource.
                resources.append({"name": fname, "id": fid})

        # BUG FIX: the original used `name[:60]`, but `name` leaked from the
        # grouping loop above (the LAST file listed in the folder), so every
        # class displayed the same wrong title. Derive the display name from
        # this class's own files instead (prefer the video filename).
        display = video if video is not None else flist[0]["name"]
        classes.append({
            "num": num,
            "name": display[:60],
            "hasVideo": video is not None,
            "hasSummary": summary is not None,
            "files": {
                "video": video_id,
                "summary": summary_id,
                "subtitles": vtt_id,
            },
            "resources": resources,
        })
    return classes
def api_call_throttle():
    """Simple rate limiter to avoid hitting Drive API quotas.

    Google Drive API allows 12,000 queries per minute for service accounts.
    We conservatively cap bursts at ~50 calls per second: every 50th call we
    sleep out the remainder of the current one-second window.

    Uses module globals API_CALL_COUNT / API_CALL_START as the window state.
    """
    global API_CALL_COUNT, API_CALL_START
    API_CALL_COUNT += 1
    if API_CALL_COUNT >= 50:
        elapsed = time.time() - API_CALL_START
        if elapsed < 1.0:
            # The burst finished early — sleep out the rest of the window.
            time.sleep(1.0 - elapsed)
        # BUG FIX: the original only reset the counters inside the sleep
        # branch, so as soon as one 50-call window took >= 1s the window was
        # never reset, `elapsed` grew forever and the throttle was permanently
        # disabled. Start a fresh window after every full burst instead.
        API_CALL_START = time.time()
        API_CALL_COUNT = 0
Google Drive API has a quota of 12,000 queries per minute for service accounts. The throttle conservatively caps bursts at roughly 50 calls per second during cache rebuild (well under the quota).
def load_scan_progress():
    """Return scan data persisted by a previous run, or {} when none exists."""
    # Guard clause: a missing progress file simply means a fresh start.
    if not os.path.exists(PROGRESS_FILE):
        return {}
    with open(PROGRESS_FILE, "r", encoding="utf-8") as f:
        return json.load(f)


def save_scan_progress(progress):
    """Persist *progress* to PROGRESS_FILE so an interrupted rebuild can resume."""
    with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
        json.dump(progress, f, ensure_ascii=False)
Progress is saved every 10 courses:
# Checkpoint: persist scan progress every 10 newly scanned courses so an
# interrupted rebuild can resume without re-scanning them.
if courses_scanned_this_run % 10 == 0:
    save_scan_progress(scanned)
    print(f"💾 Progress saved ({courses_scanned_this_run} scanned this run)")
If rebuild_cache_drive.py is interrupted, re-running it will resume from the last checkpoint, skipping already-scanned courses.
From drive_service.py:14 and drive_service.py:92-95:
DRIVE_ID_RE = re.compile(r"^[A-Za-z0-9_-]{10,}$")def _validate_drive_id(self, value, field_name="id"): if not isinstance(value, str) or not DRIVE_ID_RE.match(value.strip()): raise ValueError(f"Invalid Google Drive {field_name}") return value.strip()
All Drive file IDs are validated before API calls to prevent injection attacks.
The server rejects `local:`-prefixed file references (from the old local-file mode). Only valid Drive IDs (10+ characters from A–Z, a–z, 0–9, `-`, `_`) are accepted.
Cause: Service account doesn't have access to the Drive folder. Fix: Share the folder with the service account email (found in service_account.json under client_email).
401 Unauthorized
Cause: Invalid or expired credentials. Fix: Download a fresh service_account.json from Google Cloud Console.
404 Not Found
Cause: Incorrect DRIVE_ROOT_ID or file ID. Fix: Verify the folder ID in the Drive URL: drive.google.com/drive/folders/{ID}
Slow Streaming
Cause: Drive API rate limiting or network latency. Fix:
Check /api/health for metrics
Use FFmpeg compatibility mode for problematic files