refactor(scrape): complete rewrite of content extraction - per-turn parsing for all types
This commit is contained in:
@@ -643,6 +643,125 @@ body {
|
||||
}
|
||||
|
||||
/* Chat Input */
|
||||
|
||||
/* --- User message bubble --- */
.msg-user {
    background: var(--accent-primary);
    color: white;
    padding: 8px 14px;
    /* Smaller bottom-right radius gives the bubble its "tail" corner. */
    border-radius: 12px 12px 4px 12px;
    margin: 4px 0;
    /* Auto left margin pushes the bubble to the right edge of the list. */
    margin-left: auto;
    max-width: 80%;
    font-size: 13px;
    line-height: 1.5;
    /* Shrink the bubble to its text instead of stretching to max-width. */
    width: fit-content;
    /* The bubble is filled with plain text, so keep the author's line breaks
       and let long unbroken tokens (URLs, paths) wrap inside the bubble. */
    white-space: pre-wrap;
    overflow-wrap: anywhere;
}
|
||||
|
||||
/* --- Status line (e.g. "Running", "Generating") --- */
.msg-status {
    font-size: 12px;
    /* Falls back to --text-secondary when --text-muted is not defined. */
    color: var(--text-muted, var(--text-secondary));
    opacity: 0.6;
    padding: 2px 0;
    font-style: italic;
}
|
||||
|
||||
/* --- Markdown-rendered content inside a text message --- */

/* Tables */
.msg-text table {
    border-collapse: collapse;
    width: 100%;
    margin: 8px 0;
    font-size: 12px;
}

.msg-text table th,
.msg-text table td {
    border: 1px solid var(--border-subtle);
    padding: 6px 10px;
    text-align: left;
}

.msg-text table th {
    background: var(--bg-tertiary);
    font-weight: 600;
    color: var(--text-primary);
}

.msg-text table td {
    color: var(--text-secondary);
}

/* Block quotes */
.msg-text blockquote {
    border-left: 3px solid var(--accent-primary);
    padding: 4px 12px;
    margin: 8px 0;
    color: var(--text-secondary);
    font-style: italic;
}

/* Lists */
.msg-text ul,
.msg-text ol {
    padding-left: 20px;
    margin: 4px 0;
}

.msg-text li {
    margin: 2px 0;
}

/* Headings (only h3 and h4 get an explicit size here) */
.msg-text h1,
.msg-text h2,
.msg-text h3,
.msg-text h4 {
    margin: 8px 0 4px;
    color: var(--text-primary);
}

.msg-text h3 {
    font-size: 14px;
}

.msg-text h4 {
    font-size: 13px;
}

/* Links */
.msg-text a {
    color: var(--accent-primary);
    text-decoration: underline;
}

/* Inline code */
.msg-text code {
    background: var(--bg-tertiary);
    padding: 1px 4px;
    border-radius: 3px;
    font-family: var(--font-mono);
    font-size: 12px;
}

/* Fenced code blocks scroll horizontally instead of wrapping */
.msg-text pre {
    background: var(--bg-tertiary);
    padding: 10px;
    border-radius: 6px;
    overflow-x: auto;
    margin: 6px 0;
}

/* Code inside a <pre> relies on the <pre> box, so drop the inline-code chrome */
.msg-text pre code {
    background: none;
    padding: 0;
}

/* Horizontal rules */
.msg-text hr {
    border: none;
    border-top: 1px solid var(--border-subtle);
    margin: 8px 0;
}

/* Emphasis */
.msg-text strong {
    color: var(--text-primary);
}
|
||||
|
||||
.chat-input-area {
|
||||
display: flex;
|
||||
align-items: flex-end;
|
||||
|
||||
@@ -103,6 +103,8 @@ class ChatPanel {
|
||||
case 'code': return this._renderCode(msg);
|
||||
case 'image': return this._renderImage(msg);
|
||||
case 'actions': return this._renderActions(msg);
|
||||
case 'user': return this._renderUser(msg);
|
||||
case 'status': return this._renderStatus(msg);
|
||||
default: return null;
|
||||
}
|
||||
}
|
||||
@@ -353,4 +355,24 @@ class ChatPanel {
|
||||
_scrollToBottom() {
|
||||
this.messagesEl.scrollTop = this.messagesEl.scrollHeight;
|
||||
}
|
||||
|
||||
/**
|
||||
* 사용자 메시지 렌더링
|
||||
*/
|
||||
_renderUser(msg) {
|
||||
const wrapper = document.createElement('div');
|
||||
wrapper.className = 'msg-user';
|
||||
wrapper.textContent = msg.content;
|
||||
return wrapper;
|
||||
}
|
||||
|
||||
/**
|
||||
* 상태 표시 (Running, Generating 등)
|
||||
*/
|
||||
_renderStatus(msg) {
|
||||
const wrapper = document.createElement('div');
|
||||
wrapper.className = 'msg-status';
|
||||
wrapper.textContent = msg.content;
|
||||
return wrapper;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,171 +121,152 @@ class CDPClient {
|
||||
if (!scrollEl) return JSON.stringify([]);
|
||||
|
||||
const messages = [];
|
||||
|
||||
// 뷰포트에 실제 렌더링된 최상위 컨테이너 찾기
|
||||
const topContainer = scrollEl.querySelector('.mx-auto.w-full > div > div');
|
||||
if (!topContainer) return JSON.stringify([]);
|
||||
|
||||
// 각 turn(대화 턴)을 순회
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
|
||||
// Utility: collect the action buttons found inside a container element.
//
// Every <button> descendant is described by its trimmed label plus the
// centre point (x, y) and size (w, h) of its bounding client rectangle,
// rounded to whole pixels. A button is kept only when
//   - its label is non-empty,
//   - its rounded width is greater than zero, and
//   - its label contains one of the entries of actionKeywords
//     (case-sensitive substring match).
//
// Returns an array of { label, x, y, w, h } objects; the array is empty
// when no container is given or nothing matches.
function extractActions(container) {
    // A missing container simply has no actions.
    if (!container) return [];

    return Array.from(container.querySelectorAll('button'))
        .map(button => {
            const rect = button.getBoundingClientRect();
            return {
                label: button.textContent.trim(),
                x: Math.round(rect.left + rect.width / 2),
                y: Math.round(rect.top + rect.height / 2),
                w: Math.round(rect.width),
                h: Math.round(rect.height),
            };
        })
        .filter(info =>
            info.label &&
            info.w > 0 &&
            actionKeywords.some(keyword => info.label.includes(keyword))
        );
}
|
||||
|
||||
// Utility: extract rendered-markdown content from a container element and
// append one { type: 'text', content, html } entry per match to the
// enclosing messages array. Returns nothing.
//
// content is the trimmed text, capped at 5000 characters; html is the inner
// HTML, capped at 10000 characters.
// NOTE(review): the html cap is a plain character cut, so it can end in the
// middle of a tag - confirm the consumer tolerates truncated markup.
function extractContentBlocks(container) {
    // A missing container has nothing to extract.
    if (!container) return;

    // Matches elements with class leading-relaxed that either also carry
    // select-text themselves or sit inside a select-text ancestor.
    const markdownEls = container.querySelectorAll('.select-text .leading-relaxed, .leading-relaxed.select-text');

    for (const markdownEl of markdownEls) {
        // Strip <style> tags on a detached copy so the live page is not
        // modified and CSS source does not leak into the extracted text.
        const clone = markdownEl.cloneNode(true);
        clone.querySelectorAll('style').forEach(styleEl => styleEl.remove());

        const html = clone.innerHTML;
        const text = clone.textContent.trim();

        // Ignore empty and single-character fragments.
        if (text.length < 2) continue;
        // Ignore text that still looks like leaked CSS.
        if (text.startsWith('/*') || text.includes('prefers-color-scheme')) continue;

        messages.push({
            type: 'text',
            content: text.substring(0, 5000),
            html: html.substring(0, 10000),
        });
    }
}
|
||||
|
||||
// 각 turn을 순회
|
||||
const turns = topContainer.children;
|
||||
for (let i = 0; i < turns.length; i++) {
|
||||
const turn = turns[i];
|
||||
|
||||
// placeholder 블록 건너뛰기 (가상 스크롤)
|
||||
const isPlaceholder = turn.children.length > 0 &&
|
||||
// placeholder 건너뛰기
|
||||
if (turn.children.length > 0 &&
|
||||
Array.from(turn.children).every(c =>
|
||||
c.classList.contains('rounded-lg') &&
|
||||
c.classList.contains('bg-gray-500/10') &&
|
||||
c.textContent.trim() === ''
|
||||
);
|
||||
if (isPlaceholder) continue;
|
||||
)) continue;
|
||||
|
||||
// 턴 내부의 각 메시지 블록 순회
|
||||
const blocks = turn.querySelectorAll(':scope > *');
|
||||
for (const block of blocks) {
|
||||
// placeholder 개별 블록도 건너뛰기
|
||||
if (block.classList.contains('bg-gray-500/10') && block.textContent.trim() === '') continue;
|
||||
// style 태그 미리 제거
|
||||
turn.querySelectorAll('style').forEach(s => s.remove());
|
||||
|
||||
// 블록 내 style 태그 제거 (CSS 코드 누출 방지)
|
||||
block.querySelectorAll('style').forEach(s => s.remove());
|
||||
// --- 사용자 메시지 감지 (bg-gray-500/15 + select-text) ---
|
||||
const userMsgEl = turn.querySelector('.bg-gray-500\\\\/15.select-text, .bg-gray-500\\\\/15 .select-text');
|
||||
if (userMsgEl) {
|
||||
const text = userMsgEl.textContent.trim();
|
||||
if (text) {
|
||||
messages.push({ type: 'user', content: text.substring(0, 2000) });
|
||||
}
|
||||
}
|
||||
|
||||
// --- 작업 카드 (task boundary) ---
|
||||
const taskCard = block.querySelector('.isolate');
|
||||
if (taskCard || block.classList.contains('isolate')) {
|
||||
const card = taskCard || block;
|
||||
const titleEl = card.querySelector('.font-semibold');
|
||||
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
|
||||
const expanded = card.querySelector('[aria-expanded]');
|
||||
|
||||
// 하위 항목들 추출
|
||||
const steps = [];
|
||||
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
|
||||
const txt = step.textContent.trim();
|
||||
if (txt && txt.length > 2) {
|
||||
const svg = step.querySelector('svg');
|
||||
let icon = '';
|
||||
if (svg) {
|
||||
const cls = svg.getAttribute('class') || '';
|
||||
if (cls.includes('check')) icon = '✓';
|
||||
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
|
||||
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
|
||||
}
|
||||
steps.push({ icon, text: txt.substring(0, 200) });
|
||||
// --- isolate 카드들 (task boundary) ---
|
||||
const isolates = turn.querySelectorAll('.isolate');
|
||||
for (const card of isolates) {
|
||||
const titleEl = card.querySelector('.font-semibold');
|
||||
const summaryEl = card.querySelector('.text-sm .leading-relaxed');
|
||||
const expanded = card.querySelector('[aria-expanded]');
|
||||
|
||||
const steps = [];
|
||||
card.querySelectorAll('.flex.items-center.gap-2, .flex.w-full.items-center.gap-2').forEach(step => {
|
||||
const txt = step.textContent.trim();
|
||||
if (txt && txt.length > 2) {
|
||||
const svg = step.querySelector('svg');
|
||||
let icon = '';
|
||||
if (svg) {
|
||||
const cls = svg.getAttribute('class') || '';
|
||||
if (cls.includes('check')) icon = '✓';
|
||||
else if (cls.includes('loader') || cls.includes('spin')) icon = '⟳';
|
||||
else if (cls.includes('x-circle') || cls.includes('alert')) icon = '⚠';
|
||||
}
|
||||
});
|
||||
|
||||
// 카드 내부 액션 버튼 추출 (Cancel, Review Changes 등)
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
const cardBtns = Array.from(card.querySelectorAll('button')).map(b => {
|
||||
const label = b.textContent.trim();
|
||||
const rect = b.getBoundingClientRect();
|
||||
return {
|
||||
label,
|
||||
x: Math.round(rect.left + rect.width / 2),
|
||||
y: Math.round(rect.top + rect.height / 2),
|
||||
w: Math.round(rect.width),
|
||||
h: Math.round(rect.height),
|
||||
};
|
||||
}).filter(b => b.label && b.w > 0 && actionKeywords.some(k => b.label.includes(k)));
|
||||
|
||||
messages.push({
|
||||
type: 'task',
|
||||
title: titleEl ? titleEl.textContent.trim() : '',
|
||||
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
|
||||
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
|
||||
steps: steps.slice(0, 20),
|
||||
actions: cardBtns.slice(0, 5),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- Thought Process ---
|
||||
const thoughtBtn = block.querySelector('button');
|
||||
if (thoughtBtn && thoughtBtn.textContent.includes('Thought for')) {
|
||||
messages.push({
|
||||
type: 'thought',
|
||||
label: thoughtBtn.textContent.trim(),
|
||||
collapsed: true,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 코드 블록 ---
|
||||
const pre = block.querySelector('pre');
|
||||
if (pre && !block.querySelector('.isolate')) {
|
||||
const codeEl = pre.querySelector('code');
|
||||
const lang = codeEl ? (codeEl.className.match(/language-(\\w+)/) || [])[1] || '' : '';
|
||||
messages.push({
|
||||
type: 'code',
|
||||
language: lang,
|
||||
content: (codeEl || pre).textContent.substring(0, 2000),
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 이미지 ---
|
||||
const img = block.querySelector('img');
|
||||
if (img && img.src) {
|
||||
messages.push({
|
||||
type: 'image',
|
||||
src: img.src,
|
||||
alt: img.alt || '',
|
||||
width: img.naturalWidth || img.width || 200,
|
||||
height: img.naturalHeight || img.height || 150,
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
// --- 버튼 영역 (Proceed, Cancel 등) ---
|
||||
const actionBtns = block.querySelectorAll('button');
|
||||
if (actionBtns.length > 0) {
|
||||
const actionKeywords = ['Proceed','Cancel','Open','View','Review','Approve','Reject','Yes','No','Accept','Deny','Allow','Skip'];
|
||||
const buttons = Array.from(actionBtns).map(b => {
|
||||
const label = b.textContent.trim();
|
||||
const rect = b.getBoundingClientRect();
|
||||
return {
|
||||
label,
|
||||
x: Math.round(rect.left + rect.width / 2),
|
||||
y: Math.round(rect.top + rect.height / 2),
|
||||
w: Math.round(rect.width),
|
||||
h: Math.round(rect.height),
|
||||
};
|
||||
}).filter(b => b.label && b.w > 0);
|
||||
|
||||
if (buttons.length > 0 && buttons.some(b => actionKeywords.some(k => b.label.includes(k)))) {
|
||||
messages.push({
|
||||
type: 'actions',
|
||||
buttons: buttons.slice(0, 8),
|
||||
});
|
||||
continue;
|
||||
steps.push({ icon, text: txt.substring(0, 200) });
|
||||
}
|
||||
});
|
||||
|
||||
const cardBtns = extractActions(card);
|
||||
|
||||
messages.push({
|
||||
type: 'task',
|
||||
title: titleEl ? titleEl.textContent.trim() : '',
|
||||
summary: summaryEl ? summaryEl.textContent.trim().substring(0, 500) : '',
|
||||
collapsed: expanded ? expanded.getAttribute('aria-expanded') === 'false' : true,
|
||||
steps: steps.slice(0, 20),
|
||||
actions: cardBtns.slice(0, 5),
|
||||
});
|
||||
}
|
||||
|
||||
// --- Thought Process ---
|
||||
const thoughtBtns = turn.querySelectorAll('button');
|
||||
for (const btn of thoughtBtns) {
|
||||
if (btn.textContent.includes('Thought for')) {
|
||||
messages.push({ type: 'thought', label: btn.textContent.trim(), collapsed: true });
|
||||
}
|
||||
}
|
||||
|
||||
// --- 일반 텍스트 ---
|
||||
// style 태그 내용을 제외한 순수 텍스트만 추출
|
||||
const cloned = block.cloneNode(true);
|
||||
cloned.querySelectorAll('style').forEach(s => s.remove());
|
||||
const text = cloned.textContent.trim();
|
||||
if (text.length > 0) {
|
||||
// CSS 코드/내부 스타일 건너뛰기
|
||||
if (text.startsWith('/*') || text.startsWith('@media') ||
|
||||
text.includes('prefers-color-scheme') ||
|
||||
text.includes('{') && text.includes('}') && text.includes(':') && text.includes(';') && text.length < 2000 ||
|
||||
text.startsWith('.markdown-alert')) continue;
|
||||
// --- isolate 바깥의 마크다운 콘텐츠 ---
|
||||
// (isolate 내부가 아닌 마크다운 블록)
|
||||
const allMkEls = turn.querySelectorAll('.leading-relaxed.select-text, .select-text .leading-relaxed');
|
||||
for (const mkEl of allMkEls) {
|
||||
// isolate 내부면 건너뛰기 (이미 task로 처리)
|
||||
if (mkEl.closest('.isolate')) continue;
|
||||
|
||||
// leading-relaxed select-text → 마크다운 렌더링 텍스트
|
||||
const mkEl = block.querySelector('.leading-relaxed.select-text');
|
||||
// HTML에서도 style 태그를 DOM으로 제거
|
||||
const htmlSrc = mkEl || block;
|
||||
const htmlClone = htmlSrc.cloneNode(true);
|
||||
htmlClone.querySelectorAll('style').forEach(s => s.remove());
|
||||
const htmlContent = htmlClone.innerHTML;
|
||||
|
||||
messages.push({
|
||||
type: 'text',
|
||||
content: text.substring(0, 3000),
|
||||
html: htmlContent.substring(0, 5000),
|
||||
});
|
||||
const clone = mkEl.cloneNode(true);
|
||||
clone.querySelectorAll('style').forEach(s => s.remove());
|
||||
|
||||
const html = clone.innerHTML;
|
||||
const text = clone.textContent.trim();
|
||||
|
||||
if (!text || text.length < 2) continue;
|
||||
if (text.startsWith('/*') || text.includes('prefers-color-scheme')) continue;
|
||||
|
||||
messages.push({
|
||||
type: 'text',
|
||||
content: text.substring(0, 5000),
|
||||
html: html.substring(0, 10000),
|
||||
});
|
||||
}
|
||||
|
||||
// --- isolate 바깥 독립 코드/이미지/상태 ---
|
||||
const turnBlocks = turn.querySelectorAll(':scope > *');
|
||||
for (const block of turnBlocks) {
|
||||
if (block.querySelector('.isolate') || block.classList.contains('isolate')) continue;
|
||||
|
||||
// 상태 텍스트 (Running, Generating 등)
|
||||
if (block.classList.contains('whitespace-nowrap')) {
|
||||
const st = block.textContent.trim();
|
||||
if (st) messages.push({ type: 'status', content: st });
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user