const puppeteer = require('puppeteer');
const fs = require('fs').promises;
const PDFDocument = require('pdfkit');
/**
 * Scrapes the Striver SDE sheet on takeuforward.org with Puppeteer, collects
 * per-question descriptions, code snippets and complexity notes, and exports
 * the result as a JSON file and a PDF.
 */
class StriverSDEScraper {
  /**
   * @param {object} [options]
   * @param {boolean} [options.headless=false] - Passed to `puppeteer.launch`;
   *   set to true for unattended/production runs.
   * @param {number} [options.requestDelayMs=2000] - Pause after each scraped
   *   question page, in milliseconds (0 disables the pause).
   */
  constructor({ headless = false, requestDelayMs = 2000 } = {}) {
    this.baseUrl = 'https://takeuforward.org';
    this.sheetUrl = 'https://takeuforward.org/interviews/strivers-sde-sheet-top-coding-interview-problems';
    this.headless = headless;
    this.requestDelayMs = requestDelayMs;
    this.questionsData = [];
    this.browser = null;
    this.page = null;
  }

  /**
   * Launches the browser and opens the single page reused for every request.
   * @returns {Promise<void>}
   */
  async init() {
    console.log('🚀 Initializing browser...');
    this.browser = await puppeteer.launch({
      headless: this.headless,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    this.page = await this.browser.newPage();
    // Set user agent
    await this.page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
  }

  /**
   * Loads the sheet page and collects candidate question links.
   * @returns {Promise<Array<{title: string, url: string}>>} Links de-duplicated
   *   by URL in document order; an empty array if the page could not be read.
   */
  async getQuestionLinks() {
    console.log('🔍 Fetching SDE sheet page...');
    try {
      await this.page.goto(this.sheetUrl, { waitUntil: 'networkidle2' });

      // The callback runs inside the page, so it must be self-contained;
      // the sheet URL is handed over as an explicit argument.
      const questionLinks = await this.page.evaluate((sheetUrl) => {
        const keywords = ['data-structure', 'algorithm', 'problem', 'solution'];
        const normalize = (url) => url.split('#')[0].replace(/\/+$/, '');
        const sheetPage = normalize(sheetUrl);
        const linksByUrl = new Map();

        document.querySelectorAll('a[href*="takeuforward.org"]').forEach((anchor) => {
          const url = anchor.href;
          const title = anchor.textContent.trim();
          if (!url || !title) {
            return;
          }
          // The sheet's own URL contains "problem"; it is not a question page.
          if (normalize(url) === sheetPage) {
            return;
          }
          if (!keywords.some((keyword) => url.includes(keyword))) {
            return;
          }
          // First occurrence wins, matching document order.
          if (!linksByUrl.has(url)) {
            linksByUrl.set(url, { title, url });
          }
        });

        return Array.from(linksByUrl.values());
      }, this.sheetUrl);

      console.log(`✅ Found ${questionLinks.length} question links`);
      return questionLinks;
    } catch (error) {
      console.error('❌ Error fetching question links:', error);
      return [];
    }
  }

  /**
   * Opens one question page and extracts its description, code snippets and
   * complexity notes.
   * @param {string} questionUrl - Page to load.
   * @param {string} title - Title stored on the result (the link text).
   * @returns {Promise<object|null>} `{ title, description, approaches,
   *   timeComplexity, spaceComplexity, url }`, or null if scraping failed.
   */
  async scrapeQuestionContent(questionUrl, title) {
    console.log(`📖 Scraping: ${title}`);
    try {
      await this.page.goto(questionUrl, { waitUntil: 'networkidle2' });

      const questionData = await this.page.evaluate(() => {
        const NOT_SPECIFIED = 'Not specified';
        // Big-O expression, allowing one level of nested parentheses so that
        // forms such as O(N log(N)) are captured whole.
        const BIG_O = 'O\\((?:[^()]|\\([^()]*\\))+\\)';

        // Finds the Big-O that follows a "<type> complexity" or "<type>" label.
        const findComplexity = (text, type) => {
          const labels = [`${type} complexity`, type];
          for (const label of labels) {
            const match = text.match(new RegExp(`${label}[:\\s]*(${BIG_O})`, 'i'));
            if (match) {
              return match[1];
            }
          }
          return NOT_SPECIFIED;
        };

        // Description: the first three paragraphs, dropping very short ones.
        let description = '';
        const contentDiv = document.querySelector('.entry-content, .post-content, .content');
        if (contentDiv) {
          description = Array.from(contentDiv.querySelectorAll('p'))
            .slice(0, 3)
            .map((paragraph) => paragraph.textContent.trim())
            .filter((text) => text.length > 20)
            .join('\n');
        }

        // Code snippets: <pre><code> nesting matches the selector twice with
        // identical text, so exact duplicates are dropped.
        const seenSnippets = new Set();
        const snippets = [];
        document.querySelectorAll('pre, code, .highlight').forEach((block) => {
          const codeText = block.textContent.trim();
          if (codeText.length > 50 && !seenSnippets.has(codeText)) {
            seenSnippets.add(codeText);
            snippets.push(codeText);
          }
        });

        // Names are positional among the retained snippets, so numbering has
        // no gaps: first is brute force, last (or one mentioning "optimal")
        // is the optimal solution.
        const lastIndex = snippets.length - 1;
        const approaches = snippets.map((code, index) => {
          let name;
          if (index === 0) {
            name = 'Brute Force';
          } else if (code.toLowerCase().includes('optimal') || index === lastIndex) {
            name = 'Optimal Solution';
          } else {
            name = `Approach ${index + 1}`;
          }
          return {
            name,
            code,
            timeComplexity: findComplexity(code, 'time'),
            spaceComplexity: findComplexity(code, 'space'),
          };
        });

        // Matching is case-insensitive, so the text keeps its original case
        // and results read "O(N)" rather than "o(n)".
        const pageText = document.body.textContent;
        return {
          title: document.title || 'Unknown',
          description,
          approaches,
          timeComplexity: findComplexity(pageText, 'time'),
          spaceComplexity: findComplexity(pageText, 'space'),
        };
      });

      questionData.title = title;
      questionData.url = questionUrl;

      // Wait between requests
      if (this.requestDelayMs > 0) {
        await new Promise((resolve) => setTimeout(resolve, this.requestDelayMs));
      }
      return questionData;
    } catch (error) {
      console.error(`❌ Error scraping ${title}:`, error);
      return null;
    }
  }

  /**
   * Runs the whole pipeline: start the browser, collect links, scrape each
   * question sequentially, then write the JSON and PDF outputs. The browser
   * is closed even if a step throws.
   * @returns {Promise<void>}
   */
  async scrapeAllQuestions() {
    console.log('🚀 Starting Striver SDE Sheet scraping...');
    try {
      // Inside the try so a half-initialised browser is still closed.
      await this.init();

      const questionLinks = await this.getQuestionLinks();
      if (questionLinks.length === 0) {
        console.log('❌ No question links found!');
        return;
      }
      console.log(`📚 Found ${questionLinks.length} questions to scrape`);

      const total = questionLinks.length;
      for (const [index, link] of questionLinks.entries()) {
        const position = index + 1;
        console.log(`\n[${position}/${total}] Processing...`);

        const questionData = await this.scrapeQuestionContent(link.url, link.title);
        if (questionData) {
          this.questionsData.push(questionData);
          console.log(`✅ Scraped: ${questionData.title}`);
        } else {
          console.log(`❌ Failed to scrape: ${link.title}`);
        }

        // Progress update
        if (position % 10 === 0) {
          console.log(`\n📊 Progress: ${position}/${total} questions completed`);
        }
      }

      console.log(`\n🎉 Scraping completed! Total questions: ${this.questionsData.length}`);
      await this.saveToJson();
      await this.generatePDF();
    } finally {
      if (this.browser) {
        await this.browser.close();
        this.browser = null;
        this.page = null;
      }
    }
  }

  /**
   * Writes the collected questions to `striver_sde_questions.json`.
   * @returns {Promise<void>}
   */
  async saveToJson() {
    const filename = 'striver_sde_questions.json';
    await fs.writeFile(filename, JSON.stringify(this.questionsData, null, 2));
    console.log(`💾 Data saved to ${filename}`);
  }

  /**
   * Renders the collected questions into `Striver_SDE_Sheet_Complete.pdf`.
   * Resolves once the file has been flushed; failures are logged, not thrown.
   * @returns {Promise<void>}
   */
  async generatePDF() {
    console.log('📄 Generating PDF...');
    try {
      const filename = 'Striver_SDE_Sheet_Complete.pdf';
      const doc = new PDFDocument();
      const stream = require('fs').createWriteStream(filename);
      doc.pipe(stream);

      // Title page
      doc.fontSize(20).text('Striver SDE Sheet - Complete Solutions', 50, 50);
      doc.fontSize(12).text(`Total Questions: ${this.questionsData.length}`, 50, 100);
      doc.text(`Generated on: ${new Date().toLocaleDateString()}`, 50, 120);

      // Manual vertical cursor; a new page starts when it passes `limit`.
      let y = 180;
      const ensureRoom = (limit) => {
        if (y > limit) {
          doc.addPage();
          y = 50;
        }
      };

      this.questionsData.forEach((question, index) => {
        ensureRoom(700);
        doc.fontSize(14).text(`${index + 1}. ${question.title}`, 50, y);
        y += 30;

        if (question.description) {
          const description = question.description.substring(0, 500);
          doc.fontSize(10).text('Problem Description:', 50, y);
          y += 15;
          doc.text(description, 50, y, { width: 500 });
          // Estimate the height from the text actually printed: about 80
          // characters per wrapped line, and each newline starts a new line.
          const lineCount = description
            .split('\n')
            .reduce((sum, line) => sum + Math.max(1, Math.ceil(line.length / 80)), 0);
          y += lineCount * 12 + 20;
        }

        question.approaches.forEach((approach) => {
          ensureRoom(650);
          doc.fontSize(12).text(approach.name, 60, y);
          y += 20;
          doc.fontSize(10).text(`Time: ${approach.timeComplexity} | Space: ${approach.spaceComplexity}`, 60, y);
          y += 15;

          // Code (truncated for PDF): first 10 lines, 80 characters each.
          approach.code.split('\n').slice(0, 10).forEach((line) => {
            ensureRoom(750);
            doc.text(line.substring(0, 80), 60, y);
            y += 12;
          });
          y += 20;
        });
        y += 30;
      });

      // Wait for the file to be fully written so callers awaiting this
      // method really have a complete PDF; stream errors land in the catch.
      await new Promise((resolve, reject) => {
        stream.on('finish', resolve);
        stream.on('error', reject);
        doc.end();
      });
      console.log(`✅ PDF generated: ${filename}`);
    } catch (error) {
      console.error('❌ Error generating PDF:', error);
    }
  }
}
/**
 * Script entry point: builds a scraper with default settings and runs the
 * full scrape-and-export pipeline.
 * @returns {Promise<void>}
 */
async function main() {
  await new StriverSDEScraper().scrapeAllQuestions();
}
// Run the scraper. A fatal error is logged and reflected in the exit status
// so shells and CI do not mistake a failed run for a successful one.
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});