refactor(scrape): complete rewrite of content extraction - per-turn parsing for all types

This commit is contained in:
2026-03-07 22:57:04 +09:00
parent 22f7280907
commit 9281c6b45d
3 changed files with 268 additions and 146 deletions

View File

@@ -643,6 +643,125 @@ body {
}
/* Chat Input */
/* --- User message --- */
/* Bubble for a message sent by the user. */
.msg-user {
background: var(--accent-primary);
color: white;
padding: 8px 14px;
/* Corner order is top-left, top-right, bottom-right, bottom-left:
   only the bottom-right corner gets the tighter 4px radius. */
border-radius: 12px 12px 4px 12px;
margin: 4px 0;
/* Overrides the left margin from the shorthand above; together with the
   shrink-to-fit width below this is meant to push the bubble to the right
   (assumes a block or flex parent - confirm against the chat markup). */
margin-left: auto;
max-width: 80%;
font-size: 13px;
line-height: 1.5;
/* Size the bubble to its text instead of stretching it to the full row. */
width: fit-content;
}
/* --- Status indicator --- */
/* Small, dimmed, italic line of status text. */
.msg-status {
font-size: 12px;
/* Uses --text-muted when it is defined, otherwise falls back to
   --text-secondary. */
color: var(--text-muted, var(--text-secondary));
opacity: 0.6;
padding: 2px 0;
font-style: italic;
}
/* --- Markdown-rendered content --- */
/* Element styles for HTML nested inside a .msg-text container. */

/* Tables: full width, collapsed borders, slightly smaller text. */
.msg-text table {
border-collapse: collapse;
width: 100%;
margin: 8px 0;
font-size: 12px;
}
/* Shared cell box for header and body cells. */
.msg-text table th,
.msg-text table td {
border: 1px solid var(--border-subtle);
padding: 6px 10px;
text-align: left;
}
/* Header cells: tinted background and stronger text. */
.msg-text table th {
background: var(--bg-tertiary);
font-weight: 600;
color: var(--text-primary);
}
/* Body cells use the secondary text color. */
.msg-text table td {
color: var(--text-secondary);
}
/* Block quotes: accent-colored bar on the left, italic secondary text. */
.msg-text blockquote {
border-left: 3px solid var(--accent-primary);
padding: 4px 12px;
margin: 8px 0;
color: var(--text-secondary);
font-style: italic;
}
/* Lists: compact vertical spacing with a fixed left indent. */
.msg-text ul,
.msg-text ol {
padding-left: 20px;
margin: 4px 0;
}
.msg-text li {
margin: 2px 0;
}
/* Headings: shared margins and color for h1-h4. */
.msg-text h1,
.msg-text h2,
.msg-text h3,
.msg-text h4 {
margin: 8px 0 4px;
color: var(--text-primary);
}
/* Only h3 and h4 get an explicit font size here; h1 and h2 keep whatever
   size applies to them from elsewhere. */
.msg-text h3 {
font-size: 14px;
}
.msg-text h4 {
font-size: 13px;
}
/* Links: accent color, always underlined. */
.msg-text a {
color: var(--accent-primary);
text-decoration: underline;
}
/* Inline code: tinted pill in the monospace font. */
.msg-text code {
background: var(--bg-tertiary);
padding: 1px 4px;
border-radius: 3px;
font-family: var(--font-mono);
font-size: 12px;
}
/* Code blocks: padded panel that scrolls horizontally instead of wrapping. */
.msg-text pre {
background: var(--bg-tertiary);
padding: 10px;
border-radius: 6px;
overflow-x: auto;
margin: 6px 0;
}
/* Code inside a <pre> already sits on the panel background, so undo the
   inline-code background and padding set above. */
.msg-text pre code {
background: none;
padding: 0;
}
/* Horizontal rule: replace the default border with a single subtle top line. */
.msg-text hr {
border: none;
border-top: 1px solid var(--border-subtle);
margin: 8px 0;
}
/* Bold text uses the primary text color. */
.msg-text strong {
color: var(--text-primary);
}
.chat-input-area {
display: flex;
align-items: flex-end;

View File

@@ -103,6 +103,8 @@ class ChatPanel {
case 'code': return this._renderCode(msg);
case 'image': return this._renderImage(msg);
case 'actions': return this._renderActions(msg);
case 'user': return this._renderUser(msg);
case 'status': return this._renderStatus(msg);
default: return null;
}
}
@@ -353,4 +355,24 @@ class ChatPanel {
_scrollToBottom() {
this.messagesEl.scrollTop = this.messagesEl.scrollHeight;
}
/**
* 사용자 메시지 렌더링
*/
_renderUser(msg) {
const wrapper = document.createElement('div');
wrapper.className = 'msg-user';
wrapper.textContent = msg.content;
return wrapper;
}
/**
* 상태 표시 (Running, Generating 등)
*/
_renderStatus(msg) {
const wrapper = document.createElement('div');
wrapper.className = 'msg-status';
wrapper.textContent = msg.content;
return wrapper;
}
}

View File

@@ -121,171 +121,152 @@ class CDPClient {
if (!scrollEl) return JSON.stringify([]);
const messages = [];
// 뷰포트에 실제 렌더링된 최상위 컨테이너 찾기
const topContainer = scrollEl.querySelector('.mx-auto.w-full > div > div');
if (!topContainer) return JSON.stringify([]);
// 각 turn(대화 턴)을 순회
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
// Utility: collect the action buttons found inside a container.
//
// Every <button> under the container is measured; one is kept only when
//   - its trimmed text is non-empty,
//   - its rounded width is greater than zero, and
//   - its text contains (case-sensitive substring) at least one entry of
//     actionKeywords, which is read from the enclosing scope.
//
// Returns an array of { label, x, y, w, h } where x/y are the rounded
// center of the button's bounding rectangle and w/h its rounded size.
function extractActions(container) {
  const found = [];
  for (const button of container.querySelectorAll('button')) {
    const label = button.textContent.trim();
    const { left, top, width, height } = button.getBoundingClientRect();
    const w = Math.round(width);
    if (!label || !(w > 0)) continue;
    if (!actionKeywords.some(keyword => label.includes(keyword))) continue;
    found.push({
      label,
      x: Math.round(left + width / 2),
      y: Math.round(top + height / 2),
      w,
      h: Math.round(height),
    });
  }
  return found;
}
// Utility: extract rendered markdown content from a container.
//
// Matches elements that carry both the select-text and leading-relaxed
// classes, either on the same element or as ancestor/descendant. For each
// match a text entry is pushed onto the enclosing messages array, with the
// text capped at 5000 characters and the HTML at 10000.
function extractContentBlocks(container) {
  const selector = '.select-text .leading-relaxed, .leading-relaxed.select-text';
  container.querySelectorAll(selector).forEach(source => {
    // Work on a copy so stripping <style> tags leaves the live node alone.
    const copy = source.cloneNode(true);
    for (const styleEl of copy.querySelectorAll('style')) {
      styleEl.remove();
    }
    const html = copy.innerHTML;
    const text = copy.textContent.trim();

    // Ignore empty or single-character fragments.
    if (text.length < 2) return;

    // CSS filter: skip text that still looks like leaked stylesheet source.
    const looksLikeCss = text.startsWith('/*') || text.includes('prefers-color-scheme');
    if (looksLikeCss) return;

    messages.push({
      type: 'text',
      content: text.substring(0, 5000),
      html: html.substring(0, 10000),
    });
  });
}
// 각 turn을 순회
const turns = topContainer.children;
for (let i = 0; i < turns.length; i++) {
const turn = turns[i];
// placeholder 블록 건너뛰기 (가상 스크롤)
const isPlaceholder = turn.children.length > 0 &&
// placeholder 건너뛰기
if (turn.children.length > 0 &&
Array.from(turn.children).every(c =>
c.classList.contains('rounded-lg') &&
c.classList.contains('bg-gray-500/10') &&
c.textContent.trim() === ''
);
if (isPlaceholder) continue;
)) continue;
// 턴 내부의 각 메시지 블록 순회
const blocks = turn.querySelectorAll(':scope > *');
for (const block of blocks) {
// placeholder 개별 블록도 건너뛰기
if (block.classList.contains('bg-gray-500/10') && block.textContent.trim() === '') continue;
// style 태그 미리 제거
turn.querySelectorAll('style').forEach(s => s.remove());
// 블록 내 style 태그 제거 (CSS 코드 누출 방지)
block.querySelectorAll('style').forEach(s => s.remove());
// --- 사용자 메시지 감지 (bg-gray-500/15 + select-text) ---
const userMsgEl = turn.querySelector('.bg-gray-500\\\\/15.select-text, .bg-gray-500\\\\/15 .select-text');
if (userMsgEl) {
const text = userMsgEl.textContent.trim();
if (text) {
messages.push({ type: 'user', content: text.substring(0, 2000) });
}
}
// --- 작업 카드 (task boundary) ---
const taskCard = block.querySelector('.isolate');
if (taskCard || block.classList.contains('isolate')) {
const card = taskCard || block;
const titleEl = card.querySelector('.font-semibold');
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
const expanded = card.querySelector('[aria-expanded]');
// 하위 항목들 추출
const steps = [];
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
const txt = step.textContent.trim();
if (txt && txt.length > 2) {
const svg = step.querySelector('svg');
let icon = '';
if (svg) {
const cls = svg.getAttribute('class') || '';
if (cls.includes('check')) icon = '';
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
}
steps.push({ icon, text: txt.substring(0, 200) });
// --- isolate 카드 (task boundary) ---
const isolates = turn.querySelectorAll('.isolate');
for (const card of isolates) {
const titleEl = card.querySelector('.font-semibold');
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
const expanded = card.querySelector('[aria-expanded]');
const steps = [];
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
const txt = step.textContent.trim();
if (txt && txt.length > 2) {
const svg = step.querySelector('svg');
let icon = '';
if (svg) {
const cls = svg.getAttribute('class') || '';
if (cls.includes('check')) icon = '✓';
else if (cls.includes('loader') || cls.includes('spin')) icon = '';
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '';
}
});
// 카드 내부 액션 버튼 추출 (Cancel, Review Changes 등)
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
const cardBtns = Array.from(card.querySelectorAll('button')).map(b => {
const label = b.textContent.trim();
const rect = b.getBoundingClientRect();
return {
label,
x: Math.round(rect.left + rect.width / 2),
y: Math.round(rect.top + rect.height / 2),
w: Math.round(rect.width),
h: Math.round(rect.height),
};
}).filter(b => b.label && b.w > 0 && actionKeywords.some(k => b.label.includes(k)));
messages.push({
type: 'task',
title: titleEl ? titleEl.textContent.trim() : '',
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
steps: steps.slice(0, 20),
actions: cardBtns.slice(0, 5),
});
continue;
}
// --- Thought Process ---
const thoughtBtn = block.querySelector('button');
if (thoughtBtn && thoughtBtn.textContent.includes('Thought for')) {
messages.push({
type: 'thought',
label: thoughtBtn.textContent.trim(),
collapsed: true,
});
continue;
}
// --- 코드 블록 ---
const pre = block.querySelector('pre');
if (pre && !block.querySelector('.isolate')) {
const codeEl = pre.querySelector('code');
const lang = codeEl ? (codeEl.className.match(/language-(\\w+)/) || [])[1] || '' : '';
messages.push({
type: 'code',
language: lang,
content: (codeEl || pre).textContent.substring(0, 2000),
});
continue;
}
// --- 이미지 ---
const img = block.querySelector('img');
if (img && img.src) {
messages.push({
type: 'image',
src: img.src,
alt: img.alt || '',
width: img.naturalWidth || img.width || 200,
height: img.naturalHeight || img.height || 150,
});
continue;
}
// --- 버튼 영역 (Proceed, Cancel 등) ---
const actionBtns = block.querySelectorAll('button');
if (actionBtns.length > 0) {
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
const buttons = Array.from(actionBtns).map(b => {
const label = b.textContent.trim();
const rect = b.getBoundingClientRect();
return {
label,
x: Math.round(rect.left + rect.width / 2),
y: Math.round(rect.top + rect.height / 2),
w: Math.round(rect.width),
h: Math.round(rect.height),
};
}).filter(b => b.label && b.w > 0);
if (buttons.length > 0 && buttons.some(b => actionKeywords.some(k => b.label.includes(k)))) {
messages.push({
type: 'actions',
buttons: buttons.slice(0, 8),
});
continue;
steps.push({ icon, text: txt.substring(0, 200) });
}
});
const cardBtns = extractActions(card);
messages.push({
type: 'task',
title: titleEl ? titleEl.textContent.trim() : '',
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
steps: steps.slice(0, 20),
actions: cardBtns.slice(0, 5),
});
}
// --- Thought Process ---
const thoughtBtns = turn.querySelectorAll('button');
for (const btn of thoughtBtns) {
if (btn.textContent.includes('Thought for')) {
messages.push({ type: 'thought', label: btn.textContent.trim(), collapsed: true });
}
}
// --- 일반 텍스트 ---
// style 태그 내용을 제외한 순수 텍스트만 추출
const cloned = block.cloneNode(true);
cloned.querySelectorAll('style').forEach(s => s.remove());
const text = cloned.textContent.trim();
if (text.length > 0) {
// CSS 코드/내부 스타일 건너뛰기
if (text.startsWith('/*') || text.startsWith('@media') ||
text.includes('prefers-color-scheme') ||
text.includes('{') && text.includes('}') && text.includes(':') && text.includes(';') && text.length < 2000 ||
text.startsWith('.markdown-alert')) continue;
// --- isolate 바깥의 마크다운 콘텐츠 ---
// (isolate 내부가 아닌 마크다운 블록)
const allMkEls = turn.querySelectorAll('.leading-relaxed.select-text, .select-text .leading-relaxed');
for (const mkEl of allMkEls) {
// isolate 내부면 건너뛰기 (이미 task로 처리)
if (mkEl.closest('.isolate')) continue;
// leading-relaxed select-text → 마크다운 렌더링 텍스트
const mkEl = block.querySelector('.leading-relaxed.select-text');
// HTML에서도 style 태그를 DOM으로 제거
const htmlSrc = mkEl || block;
const htmlClone = htmlSrc.cloneNode(true);
htmlClone.querySelectorAll('style').forEach(s => s.remove());
const htmlContent = htmlClone.innerHTML;
messages.push({
type: 'text',
content: text.substring(0, 3000),
html: htmlContent.substring(0, 5000),
});
const clone = mkEl.cloneNode(true);
clone.querySelectorAll('style').forEach(s => s.remove());
const html = clone.innerHTML;
const text = clone.textContent.trim();
if (!text || text.length < 2) continue;
if (text.startsWith('/*') || text.includes('prefers-color-scheme')) continue;
messages.push({
type: 'text',
content: text.substring(0, 5000),
html: html.substring(0, 10000),
});
}
// --- isolate 바깥 독립 코드/이미지/상태 ---
const turnBlocks = turn.querySelectorAll(':scope > *');
for (const block of turnBlocks) {
if (block.querySelector('.isolate') || block.classList.contains('isolate')) continue;
// 상태 텍스트 (Running, Generating 등)
if (block.classList.contains('whitespace-nowrap')) {
const st = block.textContent.trim();
if (st) messages.push({ type: 'status', content: st });
continue;
}
}
}