/**
* mcp/skills/getLargeFiles.ts — get_large_files skill
*
* Recursively walks a directory and returns files whose size is at least a
* given threshold, sorted largest first. Complements disk_scan by identifying
* specific files (not just folders) that are consuming space.
*
* Platform strategy
* -----------------
* Pure Node.js (fs.readdir + fs.stat) on every platform — no shell needed.
*
* Smoke test
* npx tsx -r dotenv/config mcp/skills/getLargeFiles.ts [/path] [minMB] [limit]
*/
import * as fs from "fs/promises" ;
import * as os from "os" ;
import * as nodePath from "path" ;
import { z } from "zod" ;
// -- Meta ---------------------------------------------------------------------
/**
 * Skill descriptor for `get_large_files`: the registered name, the
 * description text, a set of policy flags, and the zod schema of the
 * arguments accepted by `run`.
 */
export const meta = {
  name: "get_large_files",
  description:
    "Recursively scans a directory and returns files whose size " +
    "exceeds the given threshold, sorted largest first. Use to identify " +
    "specific files consuming disk space after disk_scan has narrowed " +
    "down the target directory.",

  // Policy flags. Their consumers live outside this file.
  riskLevel: "low",
  destructive: false,
  requiresConsent: false,
  supportsDryRun: false,
  affectedScope: ["user"],
  auditRequired: false,

  // Every argument is optional; run() supplies the defaults.
  schema: {
    path: z
      .string()
      .optional()
      .describe(
        "Absolute path of the directory to scan recursively. Defaults to the user home directory.",
      ),

    minSizeBytes: z
      .number()
      .int()
      .positive()
      .optional()
      .describe("Only return files at least this large in bytes. Default: 104857600 (100 MB)."),

    limit: z
      .number()
      .int()
      .positive()
      .optional()
      .describe("Maximum number of files to return. Default: 20."),
  },
} as const;
// -- Constants ----------------------------------------------------------------
/** Size threshold applied when the caller supplies none: 100 MB. */
const DEFAULT_MIN_BYTES = 100 * 1024 ** 2;
/** Number of files returned when the caller supplies no limit. */
const DEFAULT_LIMIT = 20;
/** Deepest directory level (scan root = 0) that the walker still reads. */
const MAX_DEPTH = 12;
/**
 * Directory names the walker never descends into, because they are unlikely
 * to contain user-owned deletable files. Matching is by exact entry name.
 */
const SKIP_DIRS = new Set<string>([
  // Package managers, version control and caches
  "node_modules",
  ".git",
  ".npm",
  ".yarn",
  ".cache",
  // macOS library folder and Python environments
  "Library",
  "__pycache__",
  ".venv",
  "venv",
  // Windows system locations
  "$Recycle.Bin",
  "System Volume Information",
  "Windows",
  "Program Files",
  "Program Files (x86)",
]);
// -- Helpers ------------------------------------------------------------------
/**
 * Render a byte count as a short human-readable string, e.g. 1536 → "1.5 KB".
 *
 * Uses binary multiples (1 KB = 1024 B), one decimal place, and tops out at
 * TB (larger values are expressed in TB).
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size. Zero, negative and non-finite inputs yield
 *   "0 B" instead of a string with an undefined unit.
 */
function formatBytes(bytes: number): string {
  if (!Number.isFinite(bytes) || bytes <= 0) return "0 B";

  const units = ["B", "KB", "MB", "GB", "TB"];
  let value = bytes;
  let unitIndex = 0;

  // Repeated division by 1024 (a power of two) introduces no rounding error,
  // unlike a log(bytes) / log(1024) ratio, and can never produce a negative
  // unit index for values below one byte.
  while (value >= 1024 && unitIndex < units.length - 1) {
    value /= 1024;
    unitIndex += 1;
  }

  return `${value.toFixed(1)} ${units[unitIndex]}`;
}
/** One file that met the size threshold, as returned in the `files` list. */
interface FileEntry {
/** Full path of the file: the directory being walked joined with the entry name. */
path : string ;
/** File size as reported by fs.stat. */
size : number ;
/** `size` rendered by formatBytes (e.g. "1.5 GB"). */
sizeHuman : string ;
/** Last-modified time (stat.mtime) serialised with toISOString(). */
modified : string ; // ISO 8601
}
// -- Recursive walker ---------------------------------------------------------
/** Counters filled in by walk() so that run() can detect partial coverage. */
interface WalkStats {
/**
 * Directories the walker attempted to read. Incremented before the
 * directory listing is requested, so unreadable directories are included.
 */
dirsVisited : number ;
/** Directories whose listing failed with EPERM or EACCES. */
dirsPermissionDenied : number ;
}
/**
 * Reports whether a caught value carries a `code` of "EPERM" or "EACCES",
 * i.e. looks like a TCC / OS permission denial from a Node fs call.
 *
 * @param err - Any caught value; values without a matching `code` yield false.
 */
function isPermissionError(err: unknown): boolean {
  const errno = (err as { code?: string } | null | undefined)?.code;
  switch (errno) {
    case "EPERM":
    case "EACCES":
      return true;
    default:
      return false;
  }
}
/**
 * Depth-limited recursive scan that appends every regular file of at least
 * `minSize` bytes found under `dir` to `acc`.
 *
 * All entries of a directory are processed concurrently, so the order of
 * `acc` is not defined; callers sort it afterwards. The function never
 * rejects: unreadable directories and files are skipped.
 *
 * @param dir - Directory to list.
 * @param minSize - Minimum file size (inclusive) for a file to be recorded.
 * @param acc - Output array, mutated in place.
 * @param depth - Current depth; the scan root is 0.
 * @param stats - Counters, mutated in place.
 */
async function walk(
  dir: string,
  minSize: number,
  acc: FileEntry[],
  depth: number,
  stats: WalkStats,
): Promise<void> {
  if (depth > MAX_DEPTH) return;
  stats.dirsVisited += 1;

  let entries: import("fs").Dirent<string>[];
  try {
    entries = await fs.readdir(dir, { withFileTypes: true });
  } catch (err) {
    // Only permission denials are counted, so run() can flag partial
    // coverage. Every other failure (ENOENT, EBUSY, ...) is dropped silently.
    if (isPermissionError(err)) stats.dirsPermissionDenied += 1;
    return;
  }

  /** Handles one directory entry: recurse, record, or ignore. */
  const visit = async (entry: import("fs").Dirent<string>): Promise<void> => {
    const { name } = entry;

    // Below the scan root, dot-entries (.git, .DS_Store, ...) are ignored.
    if (depth > 0 && name.startsWith(".")) return;

    const fullPath = nodePath.join(dir, name);

    if (entry.isDirectory()) {
      if (!SKIP_DIRS.has(name)) {
        await walk(fullPath, minSize, acc, depth + 1, stats);
      }
      return;
    }

    // Anything that is neither a directory nor a regular file is ignored.
    if (!entry.isFile()) return;

    try {
      const info = await fs.stat(fullPath);
      if (info.size >= minSize) {
        acc.push({
          path: fullPath,
          size: info.size,
          sizeHuman: formatBytes(info.size),
          modified: info.mtime.toISOString(),
        });
      }
    } catch {
      /* unreadable file — skip */
    }
  };

  await Promise.allSettled(entries.map((entry) => visit(entry)));
}
// -- Exported run function ----------------------------------------------------
/**
 * Scan a directory inside the user's home and report its largest files.
 *
 * Options (all optional):
 * - `path`: directory to scan; defaults to the user home directory.
 * - `minSizeBytes`: inclusive size threshold; defaults to DEFAULT_MIN_BYTES.
 * - `limit`: maximum number of files returned; defaults to DEFAULT_LIMIT.
 *
 * `minSizeBytes` and `limit` are used as given; no validation happens here
 * (presumably the caller validates them against `meta.schema` — confirm).
 *
 * @returns The scanned path as resolved from the input, the threshold (raw
 *   and formatted), `totalFound` (all matches), `returned` (matches after
 *   applying `limit`), the `files` sorted largest first, and a `warning`
 *   string when more than 20% of visited directories were permission-denied.
 * @throws Error if the path cannot be resolved with realpath, or if its real
 *   location lies outside the home directory.
 */
export async function run({
  path: inputPath = os.homedir(),
  minSizeBytes = DEFAULT_MIN_BYTES,
  limit = DEFAULT_LIMIT,
}: {
  path?: string;
  minSizeBytes?: number;
  limit?: number;
} = {}) {
  const scanPath = nodePath.resolve(inputPath);

  // Security: restrict scanning to within the user home directory.
  // Prevents Claude from being directed to scan /etc, /var, or other
  // system paths that could leak sensitive file names to the LLM context.
  //
  // Symlink defence: resolve the real path BEFORE checking against home.
  // Otherwise a symlink inside ~/ pointing to /etc would pass the
  // relative-path check while fs.readdir walked the symlink target.
  let realScanPath: string;
  try {
    realScanPath = await fs.realpath(scanPath);
  } catch {
    throw new Error(`[get_large_files] Path not accessible: ${scanPath}`);
  }

  // Resolve home the same way so both sides of the comparison are real
  // paths; fall back to the unresolved value if realpath fails.
  const home = os.homedir();
  const realHome = await fs.realpath(home).catch(() => home);

  // The target is outside home when the relative path climbs out ("..", or
  // "../…") or cannot be expressed relatively at all (absolute, e.g. another
  // Windows drive). A plain startsWith("..") test would also reject
  // legitimate children of home whose name merely begins with two dots,
  // such as "~/..backup".
  const rel = nodePath.relative(realHome, realScanPath);
  const escapesHome =
    rel === ".." ||
    rel.startsWith(`..${nodePath.sep}`) ||
    nodePath.isAbsolute(rel);
  if (escapesHome) {
    throw new Error(
      `[get_large_files] Path must be within home directory`,
    );
  }

  const results: FileEntry[] = [];
  const stats: WalkStats = { dirsVisited: 0, dirsPermissionDenied: 0 };
  await walk(realScanPath, minSizeBytes, results, 0, stats);

  results.sort((a, b) => b.size - a.size);
  const files = results.slice(0, limit);

  // ── Partial-result detection ────────────────────────────────────────────────
  // If a meaningful share of directories couldn't be read because of OS
  // permission errors, the file list is incomplete and the user shouldn't
  // trust it for cleanup decisions. Almost always a TCC denial — the agent
  // doesn't have Full Disk Access and can't traverse into protected subtrees.
  let warning: string | undefined;
  if (
    stats.dirsVisited > 0 &&
    stats.dirsPermissionDenied / stats.dirsVisited > 0.2
  ) {
    warning =
      `Scan results are incomplete: ${stats.dirsPermissionDenied} of ` +
      `${stats.dirsVisited} directories could not be read (likely missing ` +
      `Full Disk Access). Open System Settings → Privacy & Security → ` +
      `Full Disk Access, enable AI Support Agent, then quit and relaunch.`;
  }

  return {
    scannedPath: scanPath,
    minSizeBytes,
    minSizeHuman: formatBytes(minSizeBytes),
    totalFound: results.length,
    returned: files.length,
    files,
    // The key is present only when there is something to report.
    ...(warning ? { warning } : {}),
  };
}
// -- CLI smoke test -----------------------------------------------------------
// Runs only when this file is executed directly, not when it is imported.
// Usage: getLargeFiles.ts [/path] [minMB] [limit]
if (require.main === module) {
  const scanPath = process.argv[2] ?? os.homedir();
  const minMB = parseInt(process.argv[3] ?? "100", 10);
  const limit = parseInt(process.argv[4] ?? "20", 10);

  // parseInt yields NaN for non-numeric input; without this check the scan
  // would silently run with a NaN threshold/limit and report nothing.
  if (Number.isNaN(minMB) || minMB < 0 || Number.isNaN(limit) || limit < 1) {
    console.error(
      "Invalid arguments: minMB must be a non-negative integer and limit a positive integer.",
    );
    process.exit(1);
  }

  const minSizeBytes = minMB * 1024 * 1024;
  console.log(` \n Scanning ${scanPath} for files >= ${minMB} MB (limit ${limit})... \n `);

  run({ path: scanPath, minSizeBytes, limit })
    .then((r) => {
      console.log(`Found ${r.totalFound} file(s) >= ${r.minSizeHuman} — showing ${r.returned} \n `);
      for (const f of r.files) {
        console.log(` ${f.sizeHuman.padStart(10)} ${f.path}`);
      }
    })
    .catch((err: unknown) => {
      // A rejection value is not guaranteed to be an Error instance.
      console.error(err instanceof Error ? err.message : String(err));
      process.exit(1);
    });
}