Files
guitar_score/.agent/services/claude-mem/scripts/cleanup-duplicates.ts

244 lines
8.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bun
/**
* Cleanup script for duplicate observations created by the batching bug.
*
* The bug: When multiple messages were batched together, observations were stored
* once per message ID instead of once per observation. For example, if 4 messages
* were batched and produced 3 observations, those 3 observations were stored
* 12 times (4×3) instead of 3 times.
*
* This script identifies duplicates by matching on:
* - memory_session_id (same session)
* - text (same content)
* - type (same observation type)
* - created_at_epoch within 60 seconds (same batch window)
*
* Usage:
* bun scripts/cleanup-duplicates.ts # Dry run (default)
* bun scripts/cleanup-duplicates.ts --execute # Actually delete duplicates
*/
import { Database } from 'bun:sqlite';
import { homedir } from 'os';
import { join } from 'path';
const DB_PATH = join(homedir(), '.claude-mem', 'claude-mem.db');
// Time window modes for duplicate detection
const TIME_WINDOW_MODES = {
strict: 5, // 5 seconds - only exact duplicates from same batch
normal: 60, // 60 seconds - duplicates within same minute
aggressive: 0, // 0 = ignore time entirely, match on session+text+type only
};
interface DuplicateGroup {
memory_session_id: string;
title: string;
type: string;
epoch_bucket: number;
count: number;
ids: number[];
keep_id: number;
delete_ids: number[];
}
interface ObservationRow {
id: number;
memory_session_id: string;
title: string | null;
subtitle: string | null;
narrative: string | null;
type: string;
created_at_epoch: number;
}
function main() {
const dryRun = !process.argv.includes('--execute');
const aggressive = process.argv.includes('--aggressive');
const strict = process.argv.includes('--strict');
// Determine time window
let windowMode: keyof typeof TIME_WINDOW_MODES = 'normal';
if (aggressive) windowMode = 'aggressive';
if (strict) windowMode = 'strict';
const batchWindowSeconds = TIME_WINDOW_MODES[windowMode];
console.log('='.repeat(60));
console.log('Claude-Mem Duplicate Observation Cleanup');
console.log('='.repeat(60));
console.log(`Mode: ${dryRun ? 'DRY RUN (use --execute to delete)' : 'EXECUTE'}`);
console.log(`Database: ${DB_PATH}`);
console.log(`Time window: ${windowMode} (${batchWindowSeconds === 0 ? 'ignore time' : batchWindowSeconds + ' seconds'})`);
console.log('');
console.log('Options:');
console.log(' --execute Actually delete duplicates (default: dry run)');
console.log(' --strict 5-second window (exact batch duplicates only)');
console.log(' --aggressive Ignore time, match on session+text+type only');
console.log('');
const db = dryRun
? new Database(DB_PATH, { readonly: true })
: new Database(DB_PATH);
// Get total observation count
const totalCount = db.prepare('SELECT COUNT(*) as count FROM observations').get() as { count: number };
console.log(`Total observations in database: ${totalCount.count}`);
// Find all observations and group by content fingerprint
const observations = db.prepare(`
SELECT
id,
memory_session_id,
title,
subtitle,
narrative,
type,
created_at_epoch
FROM observations
ORDER BY memory_session_id, title, type, created_at_epoch
`).all() as ObservationRow[];
console.log(`Analyzing ${observations.length} observations for duplicates...`);
console.log('');
// Group observations by fingerprint (session + text + type + time bucket)
const groups = new Map<string, ObservationRow[]>();
for (const obs of observations) {
// Skip observations without title (can't dedupe without content identifier)
if (obs.title === null) continue;
// Create content hash from title + subtitle + narrative
const contentKey = `${obs.title}|${obs.subtitle || ''}|${obs.narrative || ''}`;
// Create fingerprint based on time window mode
let fingerprint: string;
if (batchWindowSeconds === 0) {
// Aggressive mode: ignore time entirely
fingerprint = `${obs.memory_session_id}|${obs.type}|${contentKey}`;
} else {
// Normal/strict mode: include time bucket
const epochBucket = Math.floor(obs.created_at_epoch / batchWindowSeconds);
fingerprint = `${obs.memory_session_id}|${obs.type}|${epochBucket}|${contentKey}`;
}
if (!groups.has(fingerprint)) {
groups.set(fingerprint, []);
}
groups.get(fingerprint)!.push(obs);
}
// Find groups with duplicates
const duplicateGroups: DuplicateGroup[] = [];
for (const [fingerprint, rows] of groups) {
if (rows.length > 1) {
// Sort by id to keep the oldest (lowest id)
rows.sort((a, b) => a.id - b.id);
const keepId = rows[0].id;
const deleteIds = rows.slice(1).map(r => r.id);
// SAFETY: Never delete all copies - always keep at least one
if (deleteIds.length >= rows.length) {
throw new Error(`SAFETY VIOLATION: Would delete all ${rows.length} copies! Aborting.`);
}
if (!deleteIds.every(id => id !== keepId)) {
throw new Error(`SAFETY VIOLATION: Delete list contains keep_id ${keepId}! Aborting.`);
}
const title = rows[0].title || '';
duplicateGroups.push({
memory_session_id: rows[0].memory_session_id,
title: title.substring(0, 100) + (title.length > 100 ? '...' : ''),
type: rows[0].type,
epoch_bucket: batchWindowSeconds > 0 ? Math.floor(rows[0].created_at_epoch / batchWindowSeconds) : 0,
count: rows.length,
ids: rows.map(r => r.id),
keep_id: keepId,
delete_ids: deleteIds,
});
}
}
if (duplicateGroups.length === 0) {
console.log('No duplicate observations found!');
db.close();
return;
}
// Calculate stats
const totalDuplicates = duplicateGroups.reduce((sum, g) => sum + g.delete_ids.length, 0);
const affectedSessions = new Set(duplicateGroups.map(g => g.memory_session_id)).size;
console.log('DUPLICATE ANALYSIS:');
console.log('-'.repeat(60));
console.log(`Duplicate groups found: ${duplicateGroups.length}`);
console.log(`Total duplicates to remove: ${totalDuplicates}`);
console.log(`Affected sessions: ${affectedSessions}`);
console.log(`Observations after cleanup: ${totalCount.count - totalDuplicates}`);
console.log('');
// Show sample of duplicates
console.log('SAMPLE DUPLICATES (first 10 groups):');
console.log('-'.repeat(60));
for (const group of duplicateGroups.slice(0, 10)) {
console.log(`Session: ${group.memory_session_id.substring(0, 20)}...`);
console.log(`Type: ${group.type}`);
console.log(`Count: ${group.count} copies (keeping id=${group.keep_id}, deleting ${group.delete_ids.length})`);
console.log(`Title: "${group.title}"`);
console.log('');
}
if (duplicateGroups.length > 10) {
console.log(`... and ${duplicateGroups.length - 10} more groups`);
console.log('');
}
// Execute deletion if not dry run
if (!dryRun) {
console.log('EXECUTING DELETION...');
console.log('-'.repeat(60));
const allDeleteIds = duplicateGroups.flatMap(g => g.delete_ids);
// Delete in batches of 500 to avoid SQLite limits
const BATCH_SIZE = 500;
let deleted = 0;
db.exec('BEGIN TRANSACTION');
try {
for (let i = 0; i < allDeleteIds.length; i += BATCH_SIZE) {
const batch = allDeleteIds.slice(i, i + BATCH_SIZE);
const placeholders = batch.map(() => '?').join(',');
const stmt = db.prepare(`DELETE FROM observations WHERE id IN (${placeholders})`);
const result = stmt.run(...batch);
deleted += result.changes;
console.log(`Deleted batch ${Math.floor(i / BATCH_SIZE) + 1}: ${result.changes} observations`);
}
db.exec('COMMIT');
console.log('');
console.log(`Successfully deleted ${deleted} duplicate observations!`);
// Verify final count
const finalCount = db.prepare('SELECT COUNT(*) as count FROM observations').get() as { count: number };
console.log(`Final observation count: ${finalCount.count}`);
} catch (error) {
db.exec('ROLLBACK');
console.error('Error during deletion, rolled back:', error);
process.exit(1);
}
} else {
console.log('DRY RUN COMPLETE');
console.log('-'.repeat(60));
console.log('No changes were made. Run with --execute to delete duplicates.');
}
db.close();
}
main();