refactor(scrape): complete rewrite of content extraction - per-turn parsing for all types
This commit is contained in:
@@ -121,171 +121,152 @@ class CDPClient {
|
||||
if (!scrollEl) return JSON.stringify([]);
|
||||
|
||||
const messages = [];
|
||||
|
||||
// 뷰포트에 실제 렌더링된 최상위 컨테이너 찾기
|
||||
const topContainer = scrollEl.querySelector('.mx-auto.w-full > div > div');
|
||||
if (!topContainer) return JSON.stringify([]);
|
||||
|
||||
// 각 turn(대화 턴)을 순회
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
|
||||
/**
 * Utility: collect the action buttons found inside a container.
 *
 * A button is reported only when all of the following hold:
 *   - its trimmed text is non-empty,
 *   - its rounded rendered width is greater than zero,
 *   - its text contains at least one entry of the enclosing scope's
 *     actionKeywords list (case-sensitive substring match).
 *
 * @param {Element} container - Element whose descendant button elements are scanned.
 * @returns {Array<{label: string, x: number, y: number, w: number, h: number}>}
 *   One descriptor per kept button, in the order querySelectorAll yields them.
 *   x / y are the rounded centre of the bounding client rect, w / h its
 *   rounded width and height.
 */
function extractActions(container) {
  const found = [];
  for (const button of Array.from(container.querySelectorAll('button'))) {
    const label = button.textContent.trim();
    const { left, top, width, height } = button.getBoundingClientRect();
    const w = Math.round(width);

    // Unlabelled buttons and buttons with no rendered width are dropped.
    if (!label || !(w > 0)) continue;
    // Only labels mentioning a known action keyword are kept.
    if (!actionKeywords.some((keyword) => label.includes(keyword))) continue;

    found.push({
      label,
      x: Math.round(left + width / 2),
      y: Math.round(top + height / 2),
      w,
      h: Math.round(height),
    });
  }
  return found;
}
|
||||
|
||||
/**
 * Utility: extract content from the rendered markdown regions of a container.
 *
 * For every element matching '.select-text .leading-relaxed' or
 * '.leading-relaxed.select-text', a deep clone is made, its style elements
 * are removed, and one message of type 'text' is appended to the enclosing
 * scope's messages array. Regions are skipped when their trimmed text is
 * shorter than two characters, starts with a CSS comment opener, or mentions
 * 'prefers-color-scheme'.
 *
 * The pushed content is capped at 5000 characters and the html at 10000.
 *
 * @param {Element} container - Element searched for markdown regions.
 * @returns {void} Nothing is returned; results are pushed onto messages.
 */
function extractContentBlocks(container) {
  const regions = container.querySelectorAll('.select-text .leading-relaxed, .leading-relaxed.select-text');

  for (let idx = 0; idx < regions.length; idx += 1) {
    // Work on a copy so that stripping style elements never touches the live DOM.
    const copy = regions[idx].cloneNode(true);
    for (const styleEl of Array.from(copy.querySelectorAll('style'))) {
      styleEl.remove();
    }

    const html = copy.innerHTML;
    const text = copy.textContent.trim();

    const tooShort = text.length < 2;
    // CSS filter: leaked stylesheet text is not conversation content.
    const looksLikeCss = text.startsWith('/*') || text.includes('prefers-color-scheme');

    if (!tooShort && !looksLikeCss) {
      messages.push({
        type: 'text',
        content: text.substring(0, 5000),
        html: html.substring(0, 10000),
      });
    }
  }
}
|
||||
|
||||
// 각 turn을 순회
|
||||
const turns = topContainer.children;
|
||||
for (let i = 0; i < turns.length; i++) {
|
||||
const turn = turns[i];
|
||||
|
||||
// placeholder 블록 건너뛰기 (가상 스크롤)
|
||||
const isPlaceholder = turn.children.length > 0 &&
|
||||
// placeholder 건너뛰기
|
||||
if (turn.children.length > 0 &&
|
||||
Array.from(turn.children).every(c =>
|
||||
c.classList.contains('rounded-lg') &&
|
||||
c.classList.contains('bg-gray-500/10') &&
|
||||
c.textContent.trim() === ''
|
||||
);
|
||||
if (isPlaceholder) continue;
|
||||
)) continue;
|
||||
|
||||
// 턴 내부의 각 메시지 블록 순회
|
||||
const blocks = turn.querySelectorAll(':scope > *');
|
||||
for (const block of blocks) {
|
||||
// placeholder 개별 블록도 건너뛰기
|
||||
if (block.classList.contains('bg-gray-500/10') && block.textContent.trim() === '') continue;
|
||||
// style 태그 미리 제거
|
||||
turn.querySelectorAll('style').forEach(s => s.remove());
|
||||
|
||||
// 블록 내 style 태그 제거 (CSS 코드 누출 방지)
|
||||
block.querySelectorAll('style').forEach(s => s.remove());
|
||||
// --- 사용자 메시지 감지 (bg-gray-500/15 + select-text) ---
|
||||
const userMsgEl = turn.querySelector('.bg-gray-500\\\\/15.select-text, .bg-gray-500\\\\/15 .select-text');
|
||||
if (userMsgEl) {
|
||||
const text = userMsgEl.textContent.trim();
|
||||
if (text) {
|
||||
messages.push({ type: 'user', content: text.substring(0, 2000) });
|
||||
}
|
||||
}
|
||||
|
||||
// --- 작업 카드 (task boundary) ---
|
||||
const taskCard = block.querySelector('.isolate');
|
||||
if (taskCard || block.classList.contains('isolate')) {
|
||||
const card = taskCard || block;
|
||||
const titleEl = card.querySelector('.font-semibold');
|
||||
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
|
||||
const expanded = card.querySelector('[aria-expanded]');
|
||||
|
||||
// 하위 항목들 추출
|
||||
const steps = [];
|
||||
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
|
||||
const txt = step.textContent.trim();
|
||||
if (txt && txt.length > 2) {
|
||||
const svg = step.querySelector('svg');
|
||||
let icon = '';
|
||||
if (svg) {
|
||||
const cls = svg.getAttribute('class') || '';
|
||||
if (cls.includes('check')) icon = '✓';
|
||||
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
|
||||
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
|
||||
}
|
||||
steps.push({ icon, text: txt.substring(0, 200) });
|
||||
// --- isolate 카드들 (task boundary) ---
|
||||
const isolates = turn.querySelectorAll('.isolate');
|
||||
for (const card of isolates) {
|
||||
const titleEl = card.querySelector('.font-semibold');
|
||||
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
|
||||
const expanded = card.querySelector('[aria-expanded]');
|
||||
|
||||
const steps = [];
|
||||
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
|
||||
const txt = step.textContent.trim();
|
||||
if (txt && txt.length > 2) {
|
||||
const svg = step.querySelector('svg');
|
||||
let icon = '';
|
||||
if (svg) {
|
||||
const cls = svg.getAttribute('class') || '';
|
||||
if (cls.includes('check')) icon = '✓';
|
||||
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
|
||||
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
|
||||
}
|
||||
});
|
||||
|
||||
// 카드 내부 액션 버튼 추출 (Cancel, Review Changes 등)
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
const cardBtns = Array.from(card.querySelectorAll('button')).map(b => {
|
||||
const label = b.textContent.trim();
|
||||
const rect = b.getBoundingClientRect();
|
||||
return {
|
||||
label,
|
||||
x: Math.round(rect.left + rect.width / 2),
|
||||
y: Math.round(rect.top + rect.height / 2),
|
||||
w: Math.round(rect.width),
|
||||
h: Math.round(rect.height),
|
||||
};
|
||||
}).filter(b => b.label && b.w > 0 && actionKeywords.some(k => b.label.includes(k)));
|
||||
|
||||
messages.push({
|
||||
type: 'task',
|
||||
title: titleEl ? titleEl.textContent.trim() : '',
|
||||
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
|
||||
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
|
||||
steps: steps.slice(0, 20),
|
||||
actions: cardBtns.slice(0, 5),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Thought Process ---
|
||||
const thoughtBtn = block.querySelector('button');
|
||||
if (thoughtBtn && thoughtBtn.textContent.includes('Thought for')) {
|
||||
messages.push({
|
||||
type: 'thought',
|
||||
label: thoughtBtn.textContent.trim(),
|
||||
collapsed: true,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 코드 블록 ---
|
||||
const pre = block.querySelector('pre');
|
||||
if (pre && !block.querySelector('.isolate')) {
|
||||
const codeEl = pre.querySelector('code');
|
||||
const lang = codeEl ? (codeEl.className.match(/language-(\\w+)/) || [])[1] || '' : '';
|
||||
messages.push({
|
||||
type: 'code',
|
||||
language: lang,
|
||||
content: (codeEl || pre).textContent.substring(0, 2000),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 이미지 ---
|
||||
const img = block.querySelector('img');
|
||||
if (img && img.src) {
|
||||
messages.push({
|
||||
type: 'image',
|
||||
src: img.src,
|
||||
alt: img.alt || '',
|
||||
width: img.naturalWidth || img.width || 200,
|
||||
height: img.naturalHeight || img.height || 150,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 버튼 영역 (Proceed, Cancel 등) ---
|
||||
const actionBtns = block.querySelectorAll('button');
|
||||
if (actionBtns.length > 0) {
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
const buttons = Array.from(actionBtns).map(b => {
|
||||
const label = b.textContent.trim();
|
||||
const rect = b.getBoundingClientRect();
|
||||
return {
|
||||
label,
|
||||
x: Math.round(rect.left + rect.width / 2),
|
||||
y: Math.round(rect.top + rect.height / 2),
|
||||
w: Math.round(rect.width),
|
||||
h: Math.round(rect.height),
|
||||
};
|
||||
}).filter(b => b.label && b.w > 0);
|
||||
|
||||
if (buttons.length > 0 && buttons.some(b => actionKeywords.some(k => b.label.includes(k)))) {
|
||||
messages.push({
|
||||
type: 'actions',
|
||||
buttons: buttons.slice(0, 8),
|
||||
});
|
||||
continue;
|
||||
steps.push({ icon, text: txt.substring(0, 200) });
|
||||
}
|
||||
});
|
||||
|
||||
const cardBtns = extractActions(card);
|
||||
|
||||
messages.push({
|
||||
type: 'task',
|
||||
title: titleEl ? titleEl.textContent.trim() : '',
|
||||
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
|
||||
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
|
||||
steps: steps.slice(0, 20),
|
||||
actions: cardBtns.slice(0, 5),
|
||||
});
|
||||
}
|
||||
|
||||
// --- Thought Process ---
|
||||
const thoughtBtns = turn.querySelectorAll('button');
|
||||
for (const btn of thoughtBtns) {
|
||||
if (btn.textContent.includes('Thought for')) {
|
||||
messages.push({ type: 'thought', label: btn.textContent.trim(), collapsed: true });
|
||||
}
|
||||
}
|
||||
|
||||
// --- 일반 텍스트 ---
|
||||
// style 태그 내용을 제외한 순수 텍스트만 추출
|
||||
const cloned = block.cloneNode(true);
|
||||
cloned.querySelectorAll('style').forEach(s => s.remove());
|
||||
const text = cloned.textContent.trim();
|
||||
if (text.length > 0) {
|
||||
// CSS 코드/내부 스타일 건너뛰기
|
||||
if (text.startsWith('/*') || text.startsWith('@media') ||
|
||||
text.includes('prefers-color-scheme') ||
|
||||
text.includes('{') && text.includes('}') && text.includes(':') && text.includes(';') && text.length < 2000 ||
|
||||
text.startsWith('.markdown-alert')) continue;
|
||||
// --- isolate 바깥의 마크다운 콘텐츠 ---
|
||||
// (isolate 내부가 아닌 마크다운 블록)
|
||||
const allMkEls = turn.querySelectorAll('.leading-relaxed.select-text, .select-text .leading-relaxed');
|
||||
for (const mkEl of allMkEls) {
|
||||
// isolate 내부면 건너뛰기 (이미 task로 처리)
|
||||
if (mkEl.closest('.isolate')) continue;
|
||||
|
||||
// leading-relaxed select-text → 마크다운 렌더링 텍스트
|
||||
const mkEl = block.querySelector('.leading-relaxed.select-text');
|
||||
// HTML에서도 style 태그를 DOM으로 제거
|
||||
const htmlSrc = mkEl || block;
|
||||
const htmlClone = htmlSrc.cloneNode(true);
|
||||
htmlClone.querySelectorAll('style').forEach(s => s.remove());
|
||||
const htmlContent = htmlClone.innerHTML;
|
||||
|
||||
messages.push({
|
||||
type: 'text',
|
||||
content: text.substring(0, 3000),
|
||||
html: htmlContent.substring(0, 5000),
|
||||
});
|
||||
const clone = mkEl.cloneNode(true);
|
||||
clone.querySelectorAll('style').forEach(s => s.remove());
|
||||
|
||||
const html = clone.innerHTML;
|
||||
const text = clone.textContent.trim();
|
||||
|
||||
if (!text || text.length < 2) continue;
|
||||
if (text.startsWith('/*') || text.includes('prefers-color-scheme')) continue;
|
||||
|
||||
messages.push({
|
||||
type: 'text',
|
||||
content: text.substring(0, 5000),
|
||||
html: html.substring(0, 10000),
|
||||
});
|
||||
}
|
||||
|
||||
// --- isolate 바깥 독립 코드/이미지/상태 ---
|
||||
const turnBlocks = turn.querySelectorAll(':scope > *');
|
||||
for (const block of turnBlocks) {
|
||||
if (block.querySelector('.isolate') || block.classList.contains('isolate')) continue;
|
||||
|
||||
// 상태 텍스트 (Running, Generating 등)
|
||||
if (block.classList.contains('whitespace-nowrap')) {
|
||||
const st = block.textContent.trim();
|
||||
if (st) messages.push({ type: 'status', content: st });
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user