/**
* mcp/skills/findDuplicateFiles.ts — find_duplicate_files skill
*
* Finds duplicate files by comparing MD5 hashes. Scans a directory
* recursively and groups files with identical content. Use when freeing
* disk space by removing redundant copies.
*
* Platform strategy
* -----------------
 * Both platforms: pure Node.js crypto.createHash('md5') with fs.createReadStream —
 * cross-platform, no child_process needed.
*
 * Smoke test
 * ----------
* npx tsx -r dotenv/config mcp/skills/findDuplicateFiles.ts
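 *
 * Result shape (illustrative values only):
 *   { scannedPath: "/Users/me", scannedFiles: 1234, totalWastedMb: 42.5,
 *     duplicateGroups: [{ hash, sizeMb, files: [{ path, name }, …] }] }
 *   plus an optional `warning` string when the scan was only partial.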
*/
import * as fs from "fs";
import * as fsp from "fs/promises";
import * as os from "os";
import * as nodePath from "path";
import * as crypto from "crypto";
import { z } from "zod";
// -- Meta ---------------------------------------------------------------------
export const meta = {
  name: "find_duplicate_files",
  description:
    "Finds duplicate files by comparing MD5 hashes. Scans a directory " +
    "recursively and groups files with identical content. " +
    "Use when freeing disk space by removing redundant copies.",
  riskLevel: "low",
  destructive: false,
  requiresConsent: false,
  supportsDryRun: false,
  affectedScope: ["user"],
  auditRequired: false,
  schema: {
    path: z
      .string()
      .optional()
      .describe("Directory to scan. Defaults to home directory"),
    minSizeMb: z
      .number()
      .optional()
      .describe("Minimum file size in MB to consider. Default: 1"),
    extensions: z
      .array(z.string())
      .optional()
      .describe("File extensions to check e.g. ['.jpg','.pdf']. Omit for all files"),
  },
} as const;
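// Example call (illustrative; the scan is always confined to the home directory):
//   await run({ minSizeMb: 10, extensions: [".jpg", ".pdf"] });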
// -- Types --------------------------------------------------------------------
interface DuplicateFile {
  path: string;
  name: string;
}
interface DuplicateGroup {
  hash: string;
  sizeMb: number;
  files: DuplicateFile[];
}
// -- Constants ----------------------------------------------------------------
const MAX_DEPTH = 10;
const SKIP_DIRS = new Set([
  "node_modules", ".git", ".npm", ".yarn", ".cache",
  "__pycache__", ".venv", "venv",
  "$Recycle.Bin", "System Volume Information",
]);
// -- Helpers ------------------------------------------------------------------
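/** Hashes a file by streaming it through MD5, so large files are never fully buffered in memory. */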
function hashFile(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    const hash = crypto.createHash("md5");
    const stream = fs.createReadStream(filePath);
    stream.on("data", (chunk) => hash.update(chunk));
    stream.on("end", () => resolve(hash.digest("hex")));
    stream.on("error", reject);
  });
}
interface WalkStats {
  dirsVisited: number;
  dirsPermissionDenied: number;
}
/** True if a Node fs error looks like a TCC / OS permission denial. */
function isPermissionError(err: unknown): boolean {
  const code = (err as { code?: string })?.code;
  return code === "EPERM" || code === "EACCES";
}
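/**
 * Recursively collects { path, size } entries under `dir` that pass the size
 * and extension filters, skipping SKIP_DIRS and, beyond the top level, hidden
 * entries, and stopping at MAX_DEPTH. Unreadable directories are counted in `stats`.
 */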
async function walk(
  dir: string,
  minBytes: number,
  extensions: Set<string> | null,
  acc: { path: string; size: number }[],
  depth: number,
  stats: WalkStats,
): Promise<void> {
  if (depth > MAX_DEPTH) return;
  stats.dirsVisited++;
  let entries: import("fs").Dirent[];
  try {
    entries = await fsp.readdir(dir, { withFileTypes: true });
  } catch (err) {
    // Track TCC / permission denials separately so the run() result can
    // surface a partial-coverage warning. Other errors (ENOENT, EBUSY)
    // are still ignored silently — they're not actionable.
    if (isPermissionError(err)) stats.dirsPermissionDenied++;
    return;
  }
  await Promise.allSettled(
    entries.map(async (e) => {
      if (e.name.startsWith(".") && depth > 0) return;
      const full = nodePath.join(dir, e.name);
      if (e.isDirectory()) {
        if (SKIP_DIRS.has(e.name)) return;
        await walk(full, minBytes, extensions, acc, depth + 1, stats);
      } else if (e.isFile()) {
        if (extensions && !extensions.has(nodePath.extname(e.name).toLowerCase())) return;
        try {
          const stat = await fsp.stat(full);
          if (stat.size >= minBytes) {
            acc.push({ path: full, size: stat.size });
          }
        } catch {
          // skip inaccessible files
        }
      }
    }),
  );
}
// -- Exported run function ----------------------------------------------------
export async function run({
  path: inputPath,
  minSizeMb = 1,
  extensions,
}: {
  path?: string;
  minSizeMb?: number;
  extensions?: string[];
} = {}) {
  const home = os.homedir();
  const scanPath = nodePath.resolve(inputPath ?? home);
  // Security: restrict scan to within home directory
  const rel = nodePath.relative(home, scanPath);
  if (rel.startsWith("..") || nodePath.isAbsolute(rel)) {
    throw new Error(
      `[find_duplicate_files] Path must be within home directory (${home}): ${scanPath}`,
    );
  }
  try {
    await fsp.access(scanPath);
  } catch {
    throw new Error(`[find_duplicate_files] Path not accessible: ${scanPath}`);
  }
  const minBytes = Math.max(0, minSizeMb * 1024 * 1024);
  const extSet = extensions && extensions.length > 0
    ? new Set(extensions.map((e) => (e.startsWith(".") ? e : `.${e}`).toLowerCase()))
    : null;
  const files: { path: string; size: number }[] = [];
  const walkStats: WalkStats = { dirsVisited: 0, dirsPermissionDenied: 0 };
  await walk(scanPath, minBytes, extSet, files, 0, walkStats);
  // Group by size first (cheap pre-filter before hashing)
  const bySize = new Map<number, typeof files>();
  for (const f of files) {
    const group = bySize.get(f.size) ?? [];
    group.push(f);
    bySize.set(f.size, group);
  }
  // Only hash files that share a size with at least one other file
  const candidates = [...bySize.values()].filter((g) => g.length > 1).flat();
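  // Illustrative: for sizes [5 MB, 5 MB, 7 MB] only the two 5 MB files are
  // hashed; a file with a unique size cannot have an identical-content copy.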
  // Hash candidates and group by hash
  const byHash = new Map<string, typeof files>();
  await Promise.allSettled(
    candidates.map(async (f) => {
      try {
        const h = await hashFile(f.path);
        const group = byHash.get(h) ?? [];
        group.push(f);
        byHash.set(h, group);
      } catch {
        // skip unreadable files
      }
    }),
  );
// Build duplicate groups (2+ files with same hash)
  const duplicateGroups: DuplicateGroup[] = [];
  let totalWastedBytes = 0;
  for (const [hash, group] of byHash.entries()) {
    if (group.length < 2) continue;
    const sizeMb = Math.round((group[0].size / (1024 * 1024)) * 100) / 100;
    // Wasted space = (n-1) copies * size
    totalWastedBytes += (group.length - 1) * group[0].size;
    duplicateGroups.push({
      hash,
      sizeMb,
      files: group.map((f) => ({ path: f.path, name: nodePath.basename(f.path) })),
    });
  }
  // Sort by most wasted space first
  duplicateGroups.sort((a, b) => {
    const wastedA = (a.files.length - 1) * a.sizeMb;
    const wastedB = (b.files.length - 1) * b.sizeMb;
    return wastedB - wastedA;
  });
  const totalWastedMb = Math.round((totalWastedBytes / (1024 * 1024)) * 100) / 100;
// ── Partial-result detection ────────────────────────────────────────────────
// Same logic as get_large_files: when the walk skipped a meaningful share
// of directories due to OS permission errors, the duplicate set is
// necessarily incomplete. Surface so the user knows there may be more
// duplicates in unscanned subtrees.
  let warning: string | undefined;
  if (
    walkStats.dirsVisited > 0 &&
    walkStats.dirsPermissionDenied / walkStats.dirsVisited > 0.2
  ) {
    warning =
      `Duplicate scan is incomplete: ${walkStats.dirsPermissionDenied} of ` +
      `${walkStats.dirsVisited} directories could not be read (likely missing ` +
      `Full Disk Access). Open System Settings → Privacy & Security → ` +
      `Full Disk Access, enable AI Support Agent, then quit and relaunch.`;
  }
  return {
    scannedPath: scanPath,
    scannedFiles: files.length,
    duplicateGroups,
    totalWastedMb,
    ...(warning ? { warning } : {}),
  };
}
// -- CLI smoke test -----------------------------------------------------------
// Only run when the file is executed directly (e.g. `npx tsx mcp/skills/findDuplicateFiles.ts`),
// never when it is imported as a module.
if (process.argv[1]?.endsWith("findDuplicateFiles.ts")) {
  run({})
    .then((r) => console.log(JSON.stringify(r, null, 2)))
    .catch((err: Error) => { console.error(err.message); process.exit(1); });
}