From 1fab2e09e8f20bd43a6c742b629610c33808b9f6 Mon Sep 17 00:00:00 2001 From: Kiril Markov Date: Mon, 12 Apr 2021 08:59:46 +0300 Subject: [PATCH] wip --- Files/Program.cs | 173 +++++++++++++------- Files/UnixFileRecord.cs | 172 ++++++++++++++++++++ Files/UnixFileSystemEnumerator.cs | 259 ++++++++++++++++++++++++++++++ 3 files changed, 545 insertions(+), 59 deletions(-) create mode 100644 Files/UnixFileRecord.cs create mode 100644 Files/UnixFileSystemEnumerator.cs diff --git a/Files/Program.cs b/Files/Program.cs index dc126f0..551ddbf 100644 --- a/Files/Program.cs +++ b/Files/Program.cs @@ -13,7 +13,7 @@ using System.Linq; using System.Reflection; using System.Reflection.Emit; using Dapper; -using System.Security.Cryptography; +using Mono.Unix.Native; namespace Files { @@ -113,7 +113,7 @@ namespace Files if (ct.IsCancellationRequested) return; - var sameSize = connection.Query("SELECT name, size, inode FROM files WHERE size = @size", + var sameSize = connection.Query("SELECT name, size, inode FROM files WHERE size = @size", new { potentialFile.size }).ToList(); var recordsWithErrors = sameSize @@ -137,9 +137,9 @@ namespace Files var records = grp.OrderByDescending(r => r.FileInfo.LinkCount).ToList(); - DbRecord head = records.First(); - var tail = records.Skip(1).Where(r => r.Inode != head.Inode).ToList(); - var tailWithDuplicates = records.Skip(1).Where(r => r.Inode == head.Inode).ToList(); + UnixFileRecord head = records.First(); + var tail = records.Skip(1).Where(r => r.INode != head.INode).ToList(); + var tailWithDuplicates = records.Skip(1).Where(r => r.INode == head.INode).ToList(); ByteSize totalSize = records.Distinct(new DbRecordEqualityComparerByINode()).Sum(a => a.Size) - head.Size; @@ -207,17 +207,31 @@ namespace Files private static async Task InitializeDb(SqliteConnection connection) { await connection.ExecuteAsync( - "CREATE TABLE IF NOT EXISTS files " + - "(name TEXT PRIMARY KEY, size INTEGER NOT NULL, inode INTEGER NOT NULL);"); + "CREATE TABLE IF NOT EXISTS files (" + + "name TEXT PRIMARY KEY, " + + "size INTEGER NOT NULL, " + + "inode INTEGER NOT NULL, " + + "hash TEXT);"); await connection.ExecuteAsync("CREATE INDEX IF NOT EXISTS idx_files_size ON files(size);"); + await connection.ExecuteAsync("CREATE INDEX IF NOT EXISTS idx_files_inode ON files(inode);"); } private static async Task Main(string[] args) { + /*foreach (var directoryName in UnixFileSystemEnumerator.EnumeratePaths("/home", + SearchTarget.DirectoriesAndFilesAndSymLinks, + true, true, default)) + { + Console.WriteLine(directoryName); + } + ; + return;*/ + var verboseOption = new Option(new[] { "--verbose", "-v" }, "Verbose"); var hardlinkOption = new Option(new[] { "--hardlink", "-l" }, "Hardlink duplicates"); var databaseOption = new Option(new[] { "--keep", "-k" }, () => true, "Keep database."); var scanOption = new Option(new[] { "--no-scan" }, "Do not scan file system. Reuse database."); + var dbFileOption = new Option(new[] { "--database", "-db" }, "Store database in file."); var directoryArgument = new Argument( result => new DirectoryInfo("./"), isDefault: true) { @@ -232,6 +246,7 @@ namespace Files hardlinkOption, databaseOption, scanOption, + dbFileOption, directoryArgument, }; @@ -249,16 +264,100 @@ namespace Files InitialDirectory = dir, KeepDatabase = result.ValueForOption(databaseOption), SkipFileScanning = result.ValueForOption(scanOption), + DatabaseFile = result.ValueForOption(dbFileOption), }; rootCommand.Handler = CommandHandler.Create( async ct => { - await IndexFiles(config, ct); + //await IndexFiles(config, ct); + await Begin(config, ct); }); await rootCommand.InvokeAsync(args); } + + private static async Task Begin(Configuration configuration, CancellationToken ct) => + await AnsiConsole.Status() + .StartAsync("Initializing...", async ctx => + { + string dbFileName = configuration.DatabaseFile?.FullName ?? ":memory:"; + await using var connection = new SqliteConnection($"Data Source={dbFileName};"); + connection.Open(); + await using var transaction = await connection.BeginTransactionAsync(ct); + + try + { + await InitializeDb(connection); + + if (!configuration.SkipFileScanning) + { + await ScanFiles(configuration, connection, ctx, ct); + } + + await transaction.CommitAsync(ct); + } + catch (Exception exception) + { + await transaction.RollbackAsync(); + AnsiConsole.WriteException(exception); + } + }); + + private static async Task ScanFiles(Configuration configuration, SqliteConnection sqliteConnection, + StatusContext statusContext, CancellationToken ct) + { + Func errorHandler = (path, errno) => + { + string errorDescription = UnixMarshal.GetErrorDescription(errno); + string safeErrorDescription = errorDescription + .Replace("[", "[[") + .Replace("]", "]]"); + string safePath = path + .Replace("[", "[[") + .Replace("]", "]]"); + AnsiConsole.MarkupLine($"[red]:cross_mark: {safeErrorDescription}:[/] :file_folder: {safePath}"); + return false; + }; + var pathEnumerable = UnixFileSystemEnumerator.EnumeratePaths( + configuration.InitialDirectory.ToString(), + SearchTarget.DirectoriesAndFiles, + true, + errorHandler, + ct); + + foreach (string entryPath in pathEnumerable) + { + if(!UnixFileSystemInfo.TryGetFileSystemEntry(entryPath, out var entry)) continue; + + string safeEntryPath = entryPath + .Replace("[", "[[") + .Replace("]", "]]"); + + if (entry.GetType() == typeof(UnixFileInfo)) // Faster than "is" + { + var file = (UnixFileInfo) entry; + var record = new UnixFileRecord(file); + + await sqliteConnection.ExecuteAsync("INSERT OR REPLACE INTO files (name, size, inode) VALUES (@Name, @Size, @INode);", record); + + if (configuration.BeVerbose) + AnsiConsole.MarkupLine($"[green]:check_mark: OK:[/] :page_facing_up: {safeEntryPath}"); + } + else if (entry.GetType() == typeof(UnixDirectoryInfo)) // Faster than "is" + { + var directory = (UnixDirectoryInfo)entry; + statusContext.Status(safeEntryPath); + } + else if (entry.GetType() == typeof(UnixSymbolicLinkInfo)) // Faster than "is" + { + var symLink = (UnixSymbolicLinkInfo)entry; + } + + if (ct.IsCancellationRequested) + return; + } + } } class Configuration @@ -268,63 +367,19 @@ namespace Files public DirectoryInfo InitialDirectory { get; set; } public bool KeepDatabase { get; set; } public bool SkipFileScanning { get; set; } + public FileInfo DatabaseFile { get; set; } } - public class DbRecord + public class DbRecordEqualityComparerByINode : EqualityComparer { - private readonly Lazy _guid; - private readonly Lazy _fileInfo; - - public DbRecord() + public override bool Equals(UnixFileRecord x, UnixFileRecord y) { - _guid = new Lazy(GetHash); - _fileInfo = new Lazy(GetFileInfo); + return x?.INode == y?.INode; } - public DbRecord(UnixFileInfo fileInfo) + public override int GetHashCode(UnixFileRecord obj) { - _guid = new Lazy(GetHash); - _fileInfo = new Lazy(fileInfo); - Name = fileInfo.GetOriginalPath(); - Size = fileInfo.Length; - Inode = fileInfo.Inode; - } - - public string Name { get; set; } - public long Size { get; set; } - public long Inode { get; set; } - - public Guid? Hash => _guid.Value; - public UnixFileInfo FileInfo => _fileInfo.Value; - - private UnixFileInfo GetFileInfo() => new(Name); - - private Guid? GetHash() - { - try - { - using FileStream stream = File.OpenRead(Name); - var md5 = MD5.Create(); - var bytes = md5.ComputeHash(stream); - return new Guid(bytes); - } - catch - { - return null; - } - } - } - - public class DbRecordEqualityComparerByINode : EqualityComparer - { - public override bool Equals(DbRecord x, DbRecord y) - { - return x?.Inode == y?.Inode; - } - - public override int GetHashCode(DbRecord obj) - { - return obj.Inode.GetHashCode(); + return obj.INode.GetHashCode(); } } @@ -347,6 +402,6 @@ namespace Files public static string GetOriginalPath(this UnixFileSystemInfo info) => GetOriginalPathFunc(info); - public static long GetSizeOnDisk(this UnixFileSystemInfo info) => info.BlockSize * info.BlocksAllocated; + public static long GetSizeOnDisk(this UnixFileSystemInfo info) => info.BlocksAllocated * 512; } } diff --git a/Files/UnixFileRecord.cs b/Files/UnixFileRecord.cs new file mode 100644 index 0000000..e902028 --- /dev/null +++ b/Files/UnixFileRecord.cs @@ -0,0 +1,172 @@ +using System; +using System.Buffers; +using System.IO; +using System.Security.Cryptography; +using System.Threading; +using System.Threading.Tasks; +using Mono.Unix; + +namespace Files +{ + public class UnixFileRecord + { + private readonly Lazy _guid; + private readonly Lazy _fileInfo; + + public UnixFileRecord() + { + _guid = new Lazy(GetHash); + _fileInfo = new Lazy(GetFileInfo); + } + + public UnixFileRecord(string filePath, long size, long iNode) + { + Name = filePath; + Size = size; + INode = iNode; + _guid = new Lazy(GetHash); + _fileInfo = new Lazy(GetFileInfo); + } + + public UnixFileRecord(UnixFileInfo fileInfo) + { + _guid = new Lazy(GetHash); + _fileInfo = new Lazy(fileInfo); + Name = fileInfo.GetOriginalPath(); + Size = fileInfo.Length; + INode = fileInfo.Inode; + } + + public UnixFileRecord(UnixFileInfo fileInfo, Guid hash) + { + _guid = new Lazy(hash); + _fileInfo = new Lazy(fileInfo); + Name = fileInfo.GetOriginalPath(); + Size = fileInfo.Length; + INode = fileInfo.Inode; + } + + public string Name { get; init; } + public long Size { get; init; } + public long INode { get; init; } + + public Guid? Hash => _guid.Value; + public UnixFileInfo FileInfo => _fileInfo.Value; + + private UnixFileInfo GetFileInfo() => new(Name); + + private Guid? GetHash() => GetHash(Name); + + private async Task GetHashAsync(CancellationToken ct = default) => await GetHashAsync(Name, ct); + + private Guid? GetHash2(CancellationToken ct = default) => GetHash2(Name, ct); + + private async Task GetHash2Async(CancellationToken ct = default) => await GetHash2Async(Name, ct); + + private static Guid? GetHash(string filePath) + { + try + { + using FileStream stream = File.OpenRead(filePath); + var md5 = MD5.Create(); + var bytes = md5.ComputeHash(stream); + return new Guid(bytes); + } + catch + { + return null; + } + } + + private static async Task GetHashAsync(string filePath, CancellationToken ct = default) + { + try + { + await using FileStream stream = File.OpenRead(filePath); + var md5 = MD5.Create(); + var bytes = await md5.ComputeHashAsync(stream, ct); + return new Guid(bytes); + } + catch + { + return null; + } + } + + private static Guid? GetHash2(string filePath, CancellationToken ct = default) + { + using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5); + using FileStream inputStream = File.OpenRead(filePath); + + byte[] buffer = ArrayPool.Shared.Rent(4096); + + try + { + int bytesRead; + int clearLimit = 0; + + while ((bytesRead = inputStream.Read(buffer, 0, buffer.Length)) > 0) + { + if (bytesRead > clearLimit) + { + clearLimit = bytesRead; + } + + if (ct.IsCancellationRequested) return null; + + incrementalHash.AppendData(buffer, 0, bytesRead); + } + + byte[] hashBytes = incrementalHash.GetHashAndReset(); + return new Guid(hashBytes); + } + catch + { + return null; + } + finally + { + //CryptographicOperations.ZeroMemory(buffer.AsSpan(0, clearLimit)); + ArrayPool.Shared.Return(buffer, clearArray: false); + } + } + + private static async Task GetHash2Async(string filePath, CancellationToken ct = default) + { + using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5); + await using FileStream inputStream = File.OpenRead(filePath); + + byte[] buffer = ArrayPool.Shared.Rent(4096); + + try + { + int bytesRead; + int clearLimit = 0; + + while ((bytesRead = await inputStream.ReadAsync(buffer, 0, buffer.Length, ct)) > 0) + { + if (bytesRead > clearLimit) + { + clearLimit = bytesRead; + } + + if (ct.IsCancellationRequested) return null; + + incrementalHash.AppendData(buffer, 0, bytesRead); + } + + byte[] hashBytes = incrementalHash.GetHashAndReset(); + return new Guid(hashBytes); + } + catch + { + return null; + } + finally + { + //CryptographicOperations.ZeroMemory(buffer.AsSpan(0, clearLimit)); + ArrayPool.Shared.Return(buffer, clearArray: false); + } + } + } +} \ No newline at end of file diff --git a/Files/UnixFileSystemEnumerator.cs b/Files/UnixFileSystemEnumerator.cs new file mode 100644 index 0000000..17f1d35 --- /dev/null +++ b/Files/UnixFileSystemEnumerator.cs @@ -0,0 +1,259 @@ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Runtime.CompilerServices; +using System.Threading; +using Mono.Unix; +using Mono.Unix.Native; + +namespace Files +{ + [Flags] + public enum SearchTarget : byte + { + /// + /// Directories. + /// + Directories = 1, + + /// + /// Regular files. + /// + Files = 2, + + /// + /// Symbolic links. + /// + SymLinks = 4, + + /// + /// Named pipes, or FIFOs. + /// + NamedPipes = 8, + + /// + /// Local-domain socket. + /// + Sockets = 16, + + /// + /// Character devices. + /// + CharacterDevices = 32, + + /// + /// Block devices. + /// + BlockDevices = 64, + + DirectoriesAndFiles = Directories | Files, + DirectoriesAndFilesAndSymLinks = Directories | Files | SymLinks, + } + + /* File types for `d_type'. */ + internal static class DirentType + { + /// + /// The type is unknown. Only some filesystems have full support to return the type of the file, others might always return this value. + /// + public const byte DT_UNKNOWN = 0; + + /// + /// A named pipe, or FIFO. + /// + public const byte DT_FIFO = 1; + + /// + /// A character device. + /// + public const byte DT_CHR = 2; + + /// + /// A directory. + /// + public const byte DT_DIR = 4; + + /// + /// A block device. + /// + public const byte DT_BLK = 6; + + /// + /// A regular file. + /// + public const byte DT_REG = 8; + + /// + /// A symbolic link. + /// + public const byte DT_LNK = 10; + + /// + /// A local-domain socket. + /// + public const byte DT_SOCK = 12; + }; + + public static class UnixFileSystemEnumerator + { + private const string RelativeCurrentDir = "."; + private const string RelativeParentDir = ".."; + + public static IEnumerable EnumeratePaths( + string path, + SearchTarget searchTarget, + bool recurseSubdirectories, + bool ignoreInaccessible, + CancellationToken ct) + { + if (recurseSubdirectories) + { + return ignoreInaccessible + ? InternalRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct) + : InternalRecurseEnumeratePathsWithExceptions(path, searchTarget, ct); + } + + return ignoreInaccessible + ? InternalNoRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct) + : InternalNoRecurseEnumeratePathsWithExceptions(path, searchTarget, ct); + } + + private static IEnumerable GetEntries(IntPtr dirPointer, bool shouldThrow, CancellationToken ct) + { + IntPtr result; + int returnValue; + do + { + Dirent entry = new(); + returnValue = Syscall.readdir_r(dirPointer, entry, out result); + + if (ct.IsCancellationRequested) break; + + if (returnValue == 0 + && result != IntPtr.Zero + && (entry.d_name != RelativeCurrentDir && entry.d_name != RelativeParentDir)) + yield return entry; + } + while (returnValue == 0 && result != IntPtr.Zero); + if (returnValue != 0 && shouldThrow) + UnixMarshal.ThrowExceptionForLastErrorIf(returnValue); + } + + private static IEnumerable GetEntriesWithException(string directoryPath, CancellationToken ct) + { + IntPtr num = Syscall.opendir(directoryPath); + if (num == IntPtr.Zero) + UnixMarshal.ThrowExceptionForLastError(); + bool flag = false; + try + { + var entries = GetEntries(num, true, ct).ToList(); + flag = true; + return entries; + } + finally + { + int returnValue = Syscall.closedir(num); + if (flag) + UnixMarshal.ThrowExceptionForLastErrorIf(returnValue); + } + } + + private static IEnumerable GetEntriesNoException(string directoryPath, CancellationToken ct) + { + IntPtr num = Syscall.opendir(directoryPath); + if (num == IntPtr.Zero) + yield break; + + foreach (Dirent directoryEntry in GetEntries(num, false, ct)) + { + yield return directoryEntry; + } + + int returnValue = Syscall.closedir(num); + if (returnValue != 0) + UnixMarshal.ThrowExceptionForLastErrorIf(returnValue); + } + + private static IEnumerable InternalNoRecurseEnumeratePathsWithExceptions( + string path, SearchTarget searchTarget, CancellationToken ct) + { + foreach (Dirent entry in GetEntriesWithException(path, ct)) + { + if (IsTarget(entry, searchTarget)) + yield return entry.d_name; + } + } + + private static IEnumerable InternalNoRecurseEnumeratePathsWithoutExceptions( + string path, SearchTarget searchTarget, CancellationToken ct) + { + foreach (Dirent entry in GetEntriesNoException(path, ct)) + { + if (IsTarget(entry, searchTarget)) + yield return entry.d_name; + } + } + + private static IEnumerable InternalRecurseEnumeratePathsWithExceptions( + string path, SearchTarget searchTarget, CancellationToken ct) + { + Stack directoriesStack = new Stack(); + directoriesStack.Push(path); + + while (directoriesStack.TryPop(out string dir)) + { + foreach (Dirent entry in GetEntriesWithException(dir, ct)) + { + string combinedPath = Path.Combine(dir, entry.d_name); + + if (entry.d_type == 4) // Directory + { + directoriesStack.Push(combinedPath); + } + + if (IsTarget(entry, searchTarget)) yield return combinedPath; + } + } + } + + private static IEnumerable InternalRecurseEnumeratePathsWithoutExceptions( + string path, SearchTarget searchTarget, CancellationToken ct) + { + Stack directoriesStack = new Stack(); + directoriesStack.Push(path); + + while (directoriesStack.TryPop(out string dir)) + { + foreach (Dirent entry in GetEntriesNoException(dir, ct)) + { + string combinedPath = Path.Combine(dir, entry.d_name); + + if (entry.d_type == 4) // Directory + { + directoriesStack.Push(combinedPath); + } + + if (IsTarget(entry, searchTarget)) yield return combinedPath; + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static bool IsTarget(Dirent entry, SearchTarget desiredTarget) + { + return entry.d_type switch + { + DirentType.DT_DIR => (desiredTarget & SearchTarget.Directories) == SearchTarget.Directories, + DirentType.DT_REG => (desiredTarget & SearchTarget.Files) == SearchTarget.Files, + DirentType.DT_LNK => (desiredTarget & SearchTarget.SymLinks) == SearchTarget.SymLinks, + DirentType.DT_FIFO => (desiredTarget & SearchTarget.NamedPipes) == SearchTarget.NamedPipes, + DirentType.DT_SOCK => (desiredTarget & SearchTarget.Sockets) == SearchTarget.Sockets, + DirentType.DT_CHR => (desiredTarget & SearchTarget.CharacterDevices) == SearchTarget.CharacterDevices, + DirentType.DT_BLK => (desiredTarget & SearchTarget.BlockDevices) == SearchTarget.BlockDevices, + _ => false + }; + } + } +} \ No newline at end of file