refactor(scrape): complete rewrite of content extraction - per-turn parsing for all types

This commit is contained in:
2026-03-07 22:57:04 +09:00
parent 22f7280907
commit 9281c6b45d
3 changed files with 268 additions and 146 deletions

View File

@@ -121,171 +121,152 @@ class CDPClient {
if (!scrollEl) return JSON.stringify([]);
const messages = [];
// 뷰포트에 실제 렌더링된 최상위 컨테이너 찾기
const topContainer = scrollEl.querySelector('.mx-auto.w-full > div > div');
if (!topContainer) return JSON.stringify([]);
// 각 turn(대화 턴)을 순회
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
/**
 * Utility: collect the action buttons found under a container.
 *
 * Every button descendant is measured with getBoundingClientRect(). A button
 * is kept only when its trimmed label is non-empty, its rounded width is
 * positive, and the label contains at least one entry of the enclosing-scope
 * actionKeywords list (case-sensitive substring match, so a label such as
 * "Notes" matches the keyword "No").
 *
 * @param {Element} container - element whose button descendants are scanned
 * @returns {Array<{label: string, x: number, y: number, w: number, h: number}>}
 *   for each kept button: its label, the rounded centre point (x, y) and the
 *   rounded size (w, h) taken from its bounding rect
 */
function extractActions(container) {
  const picked = [];
  for (const button of container.querySelectorAll('button')) {
    const label = button.textContent.trim();
    const box = button.getBoundingClientRect();
    const w = Math.round(box.width);
    const h = Math.round(box.height);
    // Centre of the bounding rect, rounded to whole units.
    const x = Math.round(box.left + box.width / 2);
    const y = Math.round(box.top + box.height / 2);
    // Buttons with an empty label or a non-positive rounded width are dropped.
    const isUsable = label !== '' && w > 0;
    if (isUsable && actionKeywords.some((keyword) => label.includes(keyword))) {
      picked.push({ label, x, y, w, h });
    }
  }
  return picked;
}
/**
 * Utility: extract content from the markdown areas under a container.
 *
 * For each element matching the leading-relaxed / select-text selector, a
 * deep clone is made, style tags are removed from the clone, and one message
 * of type 'text' is appended to the enclosing-scope messages array. The
 * message carries the trimmed text (at most 5000 characters) and the inner
 * HTML of the clone (at most 10000 characters). Nothing is returned.
 *
 * Elements are skipped when the trimmed text is shorter than 2 characters,
 * or when it looks like CSS (it starts with a block-comment opener or
 * mentions prefers-color-scheme).
 *
 * @param {Element} container - element searched for markdown areas
 */
function extractContentBlocks(container) {
  const markdownNodes = container.querySelectorAll('.select-text .leading-relaxed, .leading-relaxed.select-text');
  for (const node of markdownNodes) {
    // Work on a copy so removing the style tags does not touch the source element.
    const copy = node.cloneNode(true);
    for (const styleTag of copy.querySelectorAll('style')) {
      styleTag.remove();
    }
    const markup = copy.innerHTML;
    const plain = copy.textContent.trim();
    const isTooShort = plain.length < 2;
    // CSS filter
    const looksLikeCss = plain.startsWith('/*') || plain.includes('prefers-color-scheme');
    if (isTooShort || looksLikeCss) {
      continue;
    }
    const entry = {
      type: 'text',
      content: plain.substring(0, 5000),
      html: markup.substring(0, 10000),
    };
    messages.push(entry);
  }
}
// 각 turn을 순회
const turns = topContainer.children;
for (let i = 0; i < turns.length; i++) {
const turn = turns[i];
// placeholder 블록 건너뛰기 (가상 스크롤)
const isPlaceholder = turn.children.length > 0 &&
// placeholder 건너뛰기
if (turn.children.length > 0 &&
Array.from(turn.children).every(c =>
c.classList.contains('rounded-lg') &&
c.classList.contains('bg-gray-500/10') &&
c.textContent.trim() === ''
);
if (isPlaceholder) continue;
)) continue;
// 턴 내부의 각 메시지 블록 순회
const blocks = turn.querySelectorAll(':scope > *');
for (const block of blocks) {
// placeholder 개별 블록도 건너뛰기
if (block.classList.contains('bg-gray-500/10') && block.textContent.trim() === '') continue;
// style 태그 미리 제거
turn.querySelectorAll('style').forEach(s => s.remove());
// 블록 내 style 태그 제거 (CSS 코드 누출 방지)
block.querySelectorAll('style').forEach(s => s.remove());
// --- 사용자 메시지 감지 (bg-gray-500/15 + select-text) ---
const userMsgEl = turn.querySelector('.bg-gray-500\\\\/15.select-text, .bg-gray-500\\\\/15 .select-text');
if (userMsgEl) {
const text = userMsgEl.textContent.trim();
if (text) {
messages.push({ type: 'user', content: text.substring(0, 2000) });
}
}
// --- 작업 카드 (task boundary) ---
const taskCard = block.querySelector('.isolate');
if (taskCard || block.classList.contains('isolate')) {
const card = taskCard || block;
const titleEl = card.querySelector('.font-semibold');
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
const expanded = card.querySelector('[aria-expanded]');
// 하위 항목들 추출
const steps = [];
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
const txt = step.textContent.trim();
if (txt && txt.length > 2) {
const svg = step.querySelector('svg');
let icon = '';
if (svg) {
const cls = svg.getAttribute('class') || '';
if (cls.includes('check')) icon = '';
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
}
steps.push({ icon, text: txt.substring(0, 200) });
// --- isolate 카드 (task boundary) ---
const isolates = turn.querySelectorAll('.isolate');
for (const card of isolates) {
const titleEl = card.querySelector('.font-semibold');
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
const expanded = card.querySelector('[aria-expanded]');
const steps = [];
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
const txt = step.textContent.trim();
if (txt && txt.length > 2) {
const svg = step.querySelector('svg');
let icon = '';
if (svg) {
const cls = svg.getAttribute('class') || '';
if (cls.includes('check')) icon = '✓';
else if (cls.includes('loader') || cls.includes('spin')) icon = '';
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '';
}
});
// 카드 내부 액션 버튼 추출 (Cancel, Review Changes 등)
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
const cardBtns = Array.from(card.querySelectorAll('button')).map(b => {
const label = b.textContent.trim();
const rect = b.getBoundingClientRect();
return {
label,
x: Math.round(rect.left + rect.width / 2),
y: Math.round(rect.top + rect.height / 2),
w: Math.round(rect.width),
h: Math.round(rect.height),
};
}).filter(b => b.label && b.w > 0 && actionKeywords.some(k => b.label.includes(k)));
messages.push({
type: 'task',
title: titleEl ? titleEl.textContent.trim() : '',
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
steps: steps.slice(0, 20),
actions: cardBtns.slice(0, 5),
});
continue;
}
// --- Thought Process ---
const thoughtBtn = block.querySelector('button');
if (thoughtBtn && thoughtBtn.textContent.includes('Thought for')) {
messages.push({
type: 'thought',
label: thoughtBtn.textContent.trim(),
collapsed: true,
});
continue;
}
// --- 코드 블록 ---
const pre = block.querySelector('pre');
if (pre && !block.querySelector('.isolate')) {
const codeEl = pre.querySelector('code');
const lang = codeEl ? (codeEl.className.match(/language-(\\w+)/) || [])[1] || '' : '';
messages.push({
type: 'code',
language: lang,
content: (codeEl || pre).textContent.substring(0, 2000),
});
continue;
}
// --- 이미지 ---
const img = block.querySelector('img');
if (img && img.src) {
messages.push({
type: 'image',
src: img.src,
alt: img.alt || '',
width: img.naturalWidth || img.width || 200,
height: img.naturalHeight || img.height || 150,
});
continue;
}
// --- 버튼 영역 (Proceed, Cancel 등) ---
const actionBtns = block.querySelectorAll('button');
if (actionBtns.length > 0) {
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
const buttons = Array.from(actionBtns).map(b => {
const label = b.textContent.trim();
const rect = b.getBoundingClientRect();
return {
label,
x: Math.round(rect.left + rect.width / 2),
y: Math.round(rect.top + rect.height / 2),
w: Math.round(rect.width),
h: Math.round(rect.height),
};
}).filter(b => b.label && b.w > 0);
if (buttons.length > 0 && buttons.some(b => actionKeywords.some(k => b.label.includes(k)))) {
messages.push({
type: 'actions',
buttons: buttons.slice(0, 8),
});
continue;
steps.push({ icon, text: txt.substring(0, 200) });
}
});
const cardBtns = extractActions(card);
messages.push({
type: 'task',
title: titleEl ? titleEl.textContent.trim() : '',
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
steps: steps.slice(0, 20),
actions: cardBtns.slice(0, 5),
});
}
// --- Thought Process ---
const thoughtBtns = turn.querySelectorAll('button');
for (const btn of thoughtBtns) {
if (btn.textContent.includes('Thought for')) {
messages.push({ type: 'thought', label: btn.textContent.trim(), collapsed: true });
}
}
// --- 일반 텍스트 ---
// style 태그 내용을 제외한 순수 텍스트만 추출
const cloned = block.cloneNode(true);
cloned.querySelectorAll('style').forEach(s => s.remove());
const text = cloned.textContent.trim();
if (text.length > 0) {
// CSS 코드/내부 스타일 건너뛰기
if (text.startsWith('/*') || text.startsWith('@media') ||
text.includes('prefers-color-scheme') ||
text.includes('{') && text.includes('}') && text.includes(':') && text.includes(';') && text.length < 2000 ||
text.startsWith('.markdown-alert')) continue;
// --- isolate 바깥의 마크다운 콘텐츠 ---
// (isolate 내부가 아닌 마크다운 블록)
const allMkEls = turn.querySelectorAll('.leading-relaxed.select-text, .select-text .leading-relaxed');
for (const mkEl of allMkEls) {
// isolate 내부면 건너뛰기 (이미 task로 처리)
if (mkEl.closest('.isolate')) continue;
// leading-relaxed select-text → 마크다운 렌더링 텍스트
const mkEl = block.querySelector('.leading-relaxed.select-text');
// HTML에서도 style 태그를 DOM으로 제거
const htmlSrc = mkEl || block;
const htmlClone = htmlSrc.cloneNode(true);
htmlClone.querySelectorAll('style').forEach(s => s.remove());
const htmlContent = htmlClone.innerHTML;
messages.push({
type: 'text',
content: text.substring(0, 3000),
html: htmlContent.substring(0, 5000),
});
const clone = mkEl.cloneNode(true);
clone.querySelectorAll('style').forEach(s => s.remove());
const html = clone.innerHTML;
const text = clone.textContent.trim();
if (!text || text.length < 2) continue;
if (text.startsWith('/*') || text.includes('prefers-color-scheme')) continue;
messages.push({
type: 'text',
content: text.substring(0, 5000),
html: html.substring(0, 10000),
});
}
// --- isolate 바깥 독립 코드/이미지/상태 ---
const turnBlocks = turn.querySelectorAll(':scope > *');
for (const block of turnBlocks) {
if (block.querySelector('.isolate') || block.classList.contains('isolate')) continue;
// 상태 텍스트 (Running, Generating 등)
if (block.classList.contains('whitespace-nowrap')) {
const st = block.textContent.trim();
if (st) messages.push({ type: 'status', content: st });
continue;
}
}
}