This commit is contained in:
2021-04-12 08:59:46 +03:00
parent 80a42d9d0a
commit 1fab2e09e8
3 changed files with 545 additions and 59 deletions

View File

@@ -13,7 +13,7 @@ using System.Linq;
using System.Reflection;
using System.Reflection.Emit;
using Dapper;
using System.Security.Cryptography;
using Mono.Unix.Native;
namespace Files
{
@@ -113,7 +113,7 @@ namespace Files
if (ct.IsCancellationRequested)
return;
var sameSize = connection.Query<DbRecord>("SELECT name, size, inode FROM files WHERE size = @size",
var sameSize = connection.Query<UnixFileRecord>("SELECT name, size, inode FROM files WHERE size = @size",
new { potentialFile.size }).ToList();
var recordsWithErrors = sameSize
@@ -137,9 +137,9 @@ namespace Files
var records = grp.OrderByDescending(r => r.FileInfo.LinkCount).ToList();
DbRecord head = records.First();
var tail = records.Skip(1).Where(r => r.Inode != head.Inode).ToList();
var tailWithDuplicates = records.Skip(1).Where(r => r.Inode == head.Inode).ToList();
UnixFileRecord head = records.First();
var tail = records.Skip(1).Where(r => r.INode != head.INode).ToList();
var tailWithDuplicates = records.Skip(1).Where(r => r.INode == head.INode).ToList();
ByteSize totalSize = records.Distinct(new DbRecordEqualityComparerByINode()).Sum(a => a.Size) - head.Size;
@@ -207,17 +207,31 @@ namespace Files
private static async Task InitializeDb(SqliteConnection connection)
{
await connection.ExecuteAsync(
"CREATE TABLE IF NOT EXISTS files " +
"(name TEXT PRIMARY KEY, size INTEGER NOT NULL, inode INTEGER NOT NULL);");
"CREATE TABLE IF NOT EXISTS files (" +
"name TEXT PRIMARY KEY, " +
"size INTEGER NOT NULL, " +
"inode INTEGER NOT NULL, " +
"hash TEXT);");
await connection.ExecuteAsync("CREATE INDEX IF NOT EXISTS idx_files_size ON files(size);");
await connection.ExecuteAsync("CREATE INDEX IF NOT EXISTS idx_files_inode ON files(inode);");
}
private static async Task Main(string[] args)
{
/*foreach (var directoryName in UnixFileSystemEnumerator.EnumeratePaths("/home",
SearchTarget.DirectoriesAndFilesAndSymLinks,
true, true, default))
{
Console.WriteLine(directoryName);
}
;
return;*/
var verboseOption = new Option<bool>(new[] { "--verbose", "-v" }, "Verbose");
var hardlinkOption = new Option<bool>(new[] { "--hardlink", "-l" }, "Hardlink duplicates");
var databaseOption = new Option<bool>(new[] { "--keep", "-k" }, () => true, "Keep database.");
var scanOption = new Option<bool>(new[] { "--no-scan" }, "Do not scan file system. Reuse database.");
var dbFileOption = new Option<FileInfo>(new[] { "--database", "-db" }, "Store database in file.");
var directoryArgument = new Argument<DirectoryInfo>(
result => new DirectoryInfo("./"), isDefault: true)
{
@@ -232,6 +246,7 @@ namespace Files
hardlinkOption,
databaseOption,
scanOption,
dbFileOption,
directoryArgument,
};
@@ -249,16 +264,100 @@ namespace Files
InitialDirectory = dir,
KeepDatabase = result.ValueForOption(databaseOption),
SkipFileScanning = result.ValueForOption(scanOption),
DatabaseFile = result.ValueForOption(dbFileOption),
};
rootCommand.Handler = CommandHandler.Create<CancellationToken>(
async ct =>
{
await IndexFiles(config, ct);
//await IndexFiles(config, ct);
await Begin(config, ct);
});
await rootCommand.InvokeAsync(args);
}
/// <summary>
/// Opens the SQLite database, makes sure the schema exists and, unless
/// <see cref="Configuration.SkipFileScanning"/> is set, scans the file system into it.
/// Everything runs inside one transaction while a console status display is shown.
/// </summary>
/// <param name="configuration">Run settings (database file, scan switch, start directory).</param>
/// <param name="ct">Token used to cancel opening the database, the scan and the commit.</param>
/// <remarks>
/// Exceptions raised after the transaction has started are not propagated: the transaction
/// is rolled back and the exception is written to the console.
/// </remarks>
private static async Task Begin(Configuration configuration, CancellationToken ct) =>
    await AnsiConsole.Status()
        .StartAsync("Initializing...", async ctx =>
        {
            // No database file configured: fall back to SQLite's in-memory data source.
            string dbFileName = configuration.DatabaseFile?.FullName ?? ":memory:";

            await using var connection = new SqliteConnection($"Data Source={dbFileName};");
            // Use the async open so the cancellation token is honored here as well.
            await connection.OpenAsync(ct);

            await using var transaction = await connection.BeginTransactionAsync(ct);
            try
            {
                await InitializeDb(connection);

                if (!configuration.SkipFileScanning)
                {
                    await ScanFiles(configuration, connection, ctx, ct);
                }

                await transaction.CommitAsync(ct);
            }
            catch (Exception exception)
            {
                // Deliberately no token here: the rollback must run even when ct is cancelled.
                await transaction.RollbackAsync();
                AnsiConsole.WriteException(exception);
            }
        });
/// <summary>
/// Walks the tree below <see cref="Configuration.InitialDirectory"/> and inserts (or replaces)
/// one row per regular file into the <c>files</c> table. Directories only update the status text.
/// </summary>
/// <param name="configuration">Run settings; supplies the start directory and the verbose switch.</param>
/// <param name="sqliteConnection">Open connection the rows are written to.</param>
/// <param name="statusContext">Console status display updated with the directory being visited.</param>
/// <param name="ct">Token checked after every processed entry; the scan returns quietly when it is set.</param>
private static async Task ScanFiles(Configuration configuration, SqliteConnection sqliteConnection,
    StatusContext statusContext, CancellationToken ct)
{
    // Doubles square brackets so that paths and messages are not parsed as console markup tags.
    static string EscapeBrackets(string text) => text
        .Replace("[", "[[")
        .Replace("]", "]]");

    // Reports an enumeration error for a path on the console.
    Func<string, Errno, bool> errorHandler = (path, errno) =>
    {
        string safeErrorDescription = EscapeBrackets(UnixMarshal.GetErrorDescription(errno));
        string safePath = EscapeBrackets(path);
        AnsiConsole.MarkupLine($"[red]:cross_mark: {safeErrorDescription}:[/] :file_folder: {safePath}");
        return false;
    };

    var pathEnumerable = UnixFileSystemEnumerator.EnumeratePaths(
        configuration.InitialDirectory.ToString(),
        SearchTarget.DirectoriesAndFiles,
        true,
        errorHandler,
        ct);

    foreach (string entryPath in pathEnumerable)
    {
        if (!UnixFileSystemInfo.TryGetFileSystemEntry(entryPath, out var entry)) continue;

        // Exact type comparison is kept on purpose (the original notes it as faster than "is").
        Type entryType = entry.GetType();

        if (entryType == typeof(UnixFileInfo))
        {
            var record = new UnixFileRecord((UnixFileInfo) entry);
            await sqliteConnection.ExecuteAsync("INSERT OR REPLACE INTO files (name, size, inode) VALUES (@Name, @Size, @INode);", record);

            if (configuration.BeVerbose)
                AnsiConsole.MarkupLine($"[green]:check_mark: OK:[/] :page_facing_up: {EscapeBrackets(entryPath)}");
        }
        else if (entryType == typeof(UnixDirectoryInfo))
        {
            statusContext.Status(EscapeBrackets(entryPath));
        }
        // Any other entry type (symbolic links included) is ignored.

        if (ct.IsCancellationRequested)
            return;
    }
}
}
class Configuration
@@ -268,63 +367,19 @@ namespace Files
public DirectoryInfo InitialDirectory { get; set; }
public bool KeepDatabase { get; set; }
public bool SkipFileScanning { get; set; }
public FileInfo DatabaseFile { get; set; }
}
public class DbRecord
public class DbRecordEqualityComparerByINode : EqualityComparer<UnixFileRecord>
{
private readonly Lazy<Guid?> _guid;
private readonly Lazy<UnixFileInfo> _fileInfo;
public DbRecord()
public override bool Equals(UnixFileRecord x, UnixFileRecord y)
{
_guid = new Lazy<Guid?>(GetHash);
_fileInfo = new Lazy<UnixFileInfo>(GetFileInfo);
return x?.INode == y?.INode;
}
public DbRecord(UnixFileInfo fileInfo)
public override int GetHashCode(UnixFileRecord obj)
{
_guid = new Lazy<Guid?>(GetHash);
_fileInfo = new Lazy<UnixFileInfo>(fileInfo);
Name = fileInfo.GetOriginalPath();
Size = fileInfo.Length;
Inode = fileInfo.Inode;
}
public string Name { get; set; }
public long Size { get; set; }
public long Inode { get; set; }
public Guid? Hash => _guid.Value;
public UnixFileInfo FileInfo => _fileInfo.Value;
private UnixFileInfo GetFileInfo() => new(Name);
private Guid? GetHash()
{
try
{
using FileStream stream = File.OpenRead(Name);
var md5 = MD5.Create();
var bytes = md5.ComputeHash(stream);
return new Guid(bytes);
}
catch
{
return null;
}
}
}
public class DbRecordEqualityComparerByINode : EqualityComparer<DbRecord>
{
public override bool Equals(DbRecord x, DbRecord y)
{
return x?.Inode == y?.Inode;
}
public override int GetHashCode(DbRecord obj)
{
return obj.Inode.GetHashCode();
return obj.INode.GetHashCode();
}
}
@@ -347,6 +402,6 @@ namespace Files
/// <summary>
/// Returns the value that <c>GetOriginalPathFunc</c> produces for <paramref name="info"/>.
/// </summary>
// NOTE(review): presumably this is the path the entry was created with - confirm against GetOriginalPathFunc.
public static string GetOriginalPath(this UnixFileSystemInfo info) => GetOriginalPathFunc(info);
public static long GetSizeOnDisk(this UnixFileSystemInfo info) => info.BlockSize * info.BlocksAllocated;
/// <summary>
/// Returns the allocated block count of <paramref name="info"/> multiplied by 512.
/// </summary>
// NOTE(review): assumes BlocksAllocated is counted in 512-byte units (as POSIX st_blocks is) - confirm.
public static long GetSizeOnDisk(this UnixFileSystemInfo info) => info.BlocksAllocated * 512;
}
}

172
Files/UnixFileRecord.cs Normal file
View File

@@ -0,0 +1,172 @@
using System;
using System.Buffers;
using System.IO;
using System.Security.Cryptography;
using System.Threading;
using System.Threading.Tasks;
using Mono.Unix;
namespace Files
{
/// <summary>
/// Describes a file by path, size and inode number, and exposes a lazily computed
/// MD5 content hash and a lazily created <see cref="UnixFileInfo"/>.
/// </summary>
public class UnixFileRecord
{
    /// <summary>Size requested from the array pool for the incremental hash read buffer.</summary>
    private const int HashBufferSize = 4096;

    private readonly Lazy<Guid?> _guid;
    private readonly Lazy<UnixFileInfo> _fileInfo;

    /// <summary>
    /// Creates an empty record; <see cref="Name"/>, <see cref="Size"/> and <see cref="INode"/>
    /// are expected to be supplied through their init accessors.
    /// </summary>
    public UnixFileRecord()
    {
        _guid = new Lazy<Guid?>(GetHash);
        _fileInfo = new Lazy<UnixFileInfo>(GetFileInfo);
    }

    /// <summary>Creates a record from already known values.</summary>
    /// <param name="filePath">Path of the file.</param>
    /// <param name="size">File size.</param>
    /// <param name="iNode">Inode number.</param>
    public UnixFileRecord(string filePath, long size, long iNode)
    {
        Name = filePath;
        Size = size;
        INode = iNode;
        _guid = new Lazy<Guid?>(GetHash);
        _fileInfo = new Lazy<UnixFileInfo>(GetFileInfo);
    }

    /// <summary>Creates a record from file system information; the hash is computed on first use.</summary>
    /// <param name="fileInfo">Source of path, length and inode.</param>
    public UnixFileRecord(UnixFileInfo fileInfo)
    {
        _guid = new Lazy<Guid?>(GetHash);
        _fileInfo = new Lazy<UnixFileInfo>(fileInfo);
        Name = fileInfo.GetOriginalPath();
        Size = fileInfo.Length;
        INode = fileInfo.Inode;
    }

    /// <summary>Creates a record from file system information and an already known hash.</summary>
    /// <param name="fileInfo">Source of path, length and inode.</param>
    /// <param name="hash">Precomputed hash returned by <see cref="Hash"/>.</param>
    public UnixFileRecord(UnixFileInfo fileInfo, Guid hash)
    {
        _guid = new Lazy<Guid?>(hash);
        _fileInfo = new Lazy<UnixFileInfo>(fileInfo);
        Name = fileInfo.GetOriginalPath();
        Size = fileInfo.Length;
        INode = fileInfo.Inode;
    }

    /// <summary>Path of the file.</summary>
    public string Name { get; init; }

    /// <summary>File size.</summary>
    public long Size { get; init; }

    /// <summary>Inode number.</summary>
    public long INode { get; init; }

    /// <summary>
    /// MD5 digest of the file content packed into a <see cref="Guid"/>, computed on first access;
    /// <c>null</c> when the file could not be read.
    /// </summary>
    public Guid? Hash => _guid.Value;

    /// <summary>File system information for <see cref="Name"/>, created on first access when not supplied.</summary>
    public UnixFileInfo FileInfo => _fileInfo.Value;

    private UnixFileInfo GetFileInfo() => new(Name);

    private Guid? GetHash() => GetHash(Name);

    private async Task<Guid?> GetHashAsync(CancellationToken ct = default) => await GetHashAsync(Name, ct);

    private Guid? GetHash2(CancellationToken ct = default) => GetHash2(Name, ct);

    private async Task<Guid?> GetHash2Async(CancellationToken ct = default) => await GetHash2Async(Name, ct);

    /// <summary>Hashes the whole file with MD5.</summary>
    /// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> when opening or reading fails.</returns>
    private static Guid? GetHash(string filePath)
    {
        try
        {
            using FileStream stream = File.OpenRead(filePath);
            using MD5 md5 = MD5.Create(); // HashAlgorithm is IDisposable
            return new Guid(md5.ComputeHash(stream));
        }
        catch
        {
            // Best effort: an unreadable file simply has no hash.
            return null;
        }
    }

    /// <summary>Asynchronous counterpart of <see cref="GetHash(string)"/>.</summary>
    /// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> on failure or cancellation.</returns>
    private static async Task<Guid?> GetHashAsync(string filePath, CancellationToken ct = default)
    {
        try
        {
            await using FileStream stream = File.OpenRead(filePath);
            using MD5 md5 = MD5.Create(); // HashAlgorithm is IDisposable
            byte[] bytes = await md5.ComputeHashAsync(stream, ct);
            return new Guid(bytes);
        }
        catch
        {
            // Best effort: an unreadable file (or a cancelled read) simply has no hash.
            return null;
        }
    }

    /// <summary>
    /// Hashes the file with MD5 in chunks through a pooled buffer, checking
    /// <paramref name="ct"/> between chunks.
    /// </summary>
    /// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> on failure or cancellation.</returns>
    private static Guid? GetHash2(string filePath, CancellationToken ct = default)
    {
        byte[] buffer = ArrayPool<byte>.Shared.Rent(HashBufferSize);
        try
        {
            // Opened inside the try so that an open failure yields null, like GetHash does.
            using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
            using FileStream inputStream = File.OpenRead(filePath);

            int bytesRead;
            while ((bytesRead = inputStream.Read(buffer, 0, buffer.Length)) > 0)
            {
                if (ct.IsCancellationRequested) return null;
                incrementalHash.AppendData(buffer, 0, bytesRead);
            }

            return new Guid(incrementalHash.GetHashAndReset());
        }
        catch
        {
            return null;
        }
        finally
        {
            ArrayPool<byte>.Shared.Return(buffer, clearArray: false);
        }
    }

    /// <summary>Asynchronous counterpart of <see cref="GetHash2(string, CancellationToken)"/>.</summary>
    /// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> on failure or cancellation.</returns>
    private static async Task<Guid?> GetHash2Async(string filePath, CancellationToken ct = default)
    {
        byte[] buffer = ArrayPool<byte>.Shared.Rent(HashBufferSize);
        try
        {
            // Opened inside the try so that an open failure yields null, like GetHashAsync does.
            using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
            await using FileStream inputStream = File.OpenRead(filePath);

            int bytesRead;
            while ((bytesRead = await inputStream.ReadAsync(buffer, 0, buffer.Length, ct)) > 0)
            {
                if (ct.IsCancellationRequested) return null;
                incrementalHash.AppendData(buffer, 0, bytesRead);
            }

            return new Guid(incrementalHash.GetHashAndReset());
        }
        catch
        {
            return null;
        }
        finally
        {
            ArrayPool<byte>.Shared.Return(buffer, clearArray: false);
        }
    }
}
}

View File

@@ -0,0 +1,259 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using Mono.Unix;
using Mono.Unix.Native;
namespace Files
{
/// <summary>
/// Bit flags selecting which kinds of directory entries an enumeration returns.
/// </summary>
[Flags]
public enum SearchTarget : byte
{
    /// <summary>Directories.</summary>
    Directories = 1 << 0,

    /// <summary>Regular files.</summary>
    Files = 1 << 1,

    /// <summary>Symbolic links.</summary>
    SymLinks = 1 << 2,

    /// <summary>Named pipes (FIFOs).</summary>
    NamedPipes = 1 << 3,

    /// <summary>Local-domain sockets.</summary>
    Sockets = 1 << 4,

    /// <summary>Character devices.</summary>
    CharacterDevices = 1 << 5,

    /// <summary>Block devices.</summary>
    BlockDevices = 1 << 6,

    /// <summary>Directories and regular files.</summary>
    DirectoriesAndFiles = Directories | Files,

    /// <summary>Directories, regular files and symbolic links.</summary>
    DirectoriesAndFilesAndSymLinks = DirectoriesAndFiles | SymLinks,
}
/* File types for `d_type'. */
/// <summary>
/// Values a directory entry's <c>d_type</c> field is compared against.
/// </summary>
internal static class DirentType
{
    /// <summary>
    /// Type not known. Only some file systems report the entry type; others may always return this value.
    /// </summary>
    public const byte DT_UNKNOWN = 0;

    /// <summary>Named pipe (FIFO).</summary>
    public const byte DT_FIFO = 1;

    /// <summary>Character device.</summary>
    public const byte DT_CHR = 2;

    /// <summary>Directory.</summary>
    public const byte DT_DIR = 4;

    /// <summary>Block device.</summary>
    public const byte DT_BLK = 6;

    /// <summary>Regular file.</summary>
    public const byte DT_REG = 8;

    /// <summary>Symbolic link.</summary>
    public const byte DT_LNK = 10;

    /// <summary>Local-domain socket.</summary>
    public const byte DT_SOCK = 12;
}
/// <summary>
/// Enumerates directory entries through the native <c>opendir</c>/<c>readdir_r</c>/<c>closedir</c>
/// calls and filters them by the type reported in <c>d_type</c>.
/// </summary>
/// <remarks>
/// Entries whose <c>d_type</c> is <see cref="DirentType.DT_UNKNOWN"/> (or any value not listed in
/// <see cref="DirentType"/>) are never returned and never descended into.
/// </remarks>
public static class UnixFileSystemEnumerator
{
    private const string RelativeCurrentDir = ".";
    private const string RelativeParentDir = "..";

    /// <summary>
    /// Lazily enumerates the entries of <paramref name="path"/> that match <paramref name="searchTarget"/>.
    /// </summary>
    /// <param name="path">Directory to enumerate.</param>
    /// <param name="searchTarget">Kinds of entries to return.</param>
    /// <param name="recurseSubdirectories">
    /// <c>true</c> to walk the whole tree; the results are then <paramref name="path"/> combined with
    /// the relative location of each entry. <c>false</c> to list only <paramref name="path"/> itself;
    /// the results are then bare entry names.
    /// </param>
    /// <param name="ignoreInaccessible">
    /// <c>true</c> to silently skip directories that cannot be opened and to ignore read errors;
    /// <c>false</c> to raise an exception for them.
    /// </param>
    /// <param name="ct">Stops the enumeration once cancellation is requested.</param>
    /// <returns>A deferred sequence; nothing is read until it is enumerated.</returns>
    public static IEnumerable<string> EnumeratePaths(
        string path,
        SearchTarget searchTarget,
        bool recurseSubdirectories,
        bool ignoreInaccessible,
        CancellationToken ct)
    {
        if (recurseSubdirectories)
        {
            return ignoreInaccessible
                ? InternalRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct)
                : InternalRecurseEnumeratePathsWithExceptions(path, searchTarget, ct);
        }

        return ignoreInaccessible
            ? InternalNoRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct)
            : InternalNoRecurseEnumeratePathsWithExceptions(path, searchTarget, ct);
    }

    /// <summary>
    /// Reads entries from an open directory stream until the end, an error, or cancellation,
    /// leaving out "." and "..".
    /// </summary>
    /// <param name="dirPointer">Handle returned by <c>opendir</c>; not closed here.</param>
    /// <param name="shouldThrow">Whether a non-zero <c>readdir_r</c> result is handed to <see cref="UnixMarshal"/> for raising.</param>
    /// <param name="ct">Checked after every read; no further entry is yielded once it is set.</param>
    private static IEnumerable<Dirent> GetEntries(IntPtr dirPointer, bool shouldThrow, CancellationToken ct)
    {
        IntPtr result;
        int returnValue;
        do
        {
            // A fresh instance per read, because each yielded entry is handed to the caller.
            Dirent entry = new();
            returnValue = Syscall.readdir_r(dirPointer, entry, out result);

            if (ct.IsCancellationRequested) break;

            if (returnValue == 0
                && result != IntPtr.Zero
                && entry.d_name != RelativeCurrentDir
                && entry.d_name != RelativeParentDir)
                yield return entry;
        }
        while (returnValue == 0 && result != IntPtr.Zero);

        if (returnValue != 0 && shouldThrow)
            UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
    }

    /// <summary>
    /// Reads all entries of a directory eagerly, raising an exception when the directory cannot be
    /// opened, read or closed.
    /// </summary>
    private static IEnumerable<Dirent> GetEntriesWithException(string directoryPath, CancellationToken ct)
    {
        IntPtr dirPointer = Syscall.opendir(directoryPath);
        if (dirPointer == IntPtr.Zero)
            UnixMarshal.ThrowExceptionForLastError();

        bool readSucceeded = false;
        try
        {
            List<Dirent> entries = GetEntries(dirPointer, true, ct).ToList();
            readSucceeded = true;
            return entries;
        }
        finally
        {
            int returnValue = Syscall.closedir(dirPointer);
            // Report a close failure only when it would not mask an earlier read error.
            if (readSucceeded)
                UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
        }
    }

    /// <summary>
    /// Streams the entries of a directory, yielding nothing when it cannot be opened and ignoring
    /// read errors.
    /// </summary>
    private static IEnumerable<Dirent> GetEntriesNoException(string directoryPath, CancellationToken ct)
    {
        IntPtr dirPointer = Syscall.opendir(directoryPath);
        if (dirPointer == IntPtr.Zero)
            yield break;

        bool readCompleted = false;
        try
        {
            foreach (Dirent directoryEntry in GetEntries(dirPointer, false, ct))
            {
                yield return directoryEntry;
            }

            readCompleted = true;
        }
        finally
        {
            // The finally block also runs when the consumer stops enumerating early,
            // so the directory handle is always released.
            int returnValue = Syscall.closedir(dirPointer);
            if (readCompleted && returnValue != 0)
                UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
        }
    }

    private static IEnumerable<string> InternalNoRecurseEnumeratePathsWithExceptions(
        string path, SearchTarget searchTarget, CancellationToken ct) =>
        EnumerateNames(path, searchTarget, ct, GetEntriesWithException);

    private static IEnumerable<string> InternalNoRecurseEnumeratePathsWithoutExceptions(
        string path, SearchTarget searchTarget, CancellationToken ct) =>
        EnumerateNames(path, searchTarget, ct, GetEntriesNoException);

    private static IEnumerable<string> InternalRecurseEnumeratePathsWithExceptions(
        string path, SearchTarget searchTarget, CancellationToken ct) =>
        EnumerateTree(path, searchTarget, ct, GetEntriesWithException);

    private static IEnumerable<string> InternalRecurseEnumeratePathsWithoutExceptions(
        string path, SearchTarget searchTarget, CancellationToken ct) =>
        EnumerateTree(path, searchTarget, ct, GetEntriesNoException);

    /// <summary>Yields the names of the matching entries located directly in <paramref name="path"/>.</summary>
    private static IEnumerable<string> EnumerateNames(
        string path,
        SearchTarget searchTarget,
        CancellationToken ct,
        Func<string, CancellationToken, IEnumerable<Dirent>> readDirectory)
    {
        foreach (Dirent entry in readDirectory(path, ct))
        {
            if (IsTarget(entry, searchTarget))
                yield return entry.d_name;
        }
    }

    /// <summary>
    /// Walks the tree below <paramref name="path"/> with an explicit stack and yields the combined
    /// path of every matching entry. Only entries typed as directories are descended into.
    /// </summary>
    private static IEnumerable<string> EnumerateTree(
        string path,
        SearchTarget searchTarget,
        CancellationToken ct,
        Func<string, CancellationToken, IEnumerable<Dirent>> readDirectory)
    {
        var pendingDirectories = new Stack<string>();
        pendingDirectories.Push(path);

        // Stop opening further directories as soon as cancellation is requested.
        while (!ct.IsCancellationRequested && pendingDirectories.TryPop(out string dir))
        {
            foreach (Dirent entry in readDirectory(dir, ct))
            {
                string combinedPath = Path.Combine(dir, entry.d_name);

                if (entry.d_type == DirentType.DT_DIR)
                {
                    pendingDirectories.Push(combinedPath);
                }

                if (IsTarget(entry, searchTarget))
                    yield return combinedPath;
            }
        }
    }

    /// <summary>
    /// Tells whether the type of <paramref name="entry"/> is among the flags set in
    /// <paramref name="desiredTarget"/>; unknown types never match.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static bool IsTarget(Dirent entry, SearchTarget desiredTarget)
    {
        return entry.d_type switch
        {
            DirentType.DT_DIR => (desiredTarget & SearchTarget.Directories) == SearchTarget.Directories,
            DirentType.DT_REG => (desiredTarget & SearchTarget.Files) == SearchTarget.Files,
            DirentType.DT_LNK => (desiredTarget & SearchTarget.SymLinks) == SearchTarget.SymLinks,
            DirentType.DT_FIFO => (desiredTarget & SearchTarget.NamedPipes) == SearchTarget.NamedPipes,
            DirentType.DT_SOCK => (desiredTarget & SearchTarget.Sockets) == SearchTarget.Sockets,
            DirentType.DT_CHR => (desiredTarget & SearchTarget.CharacterDevices) == SearchTarget.CharacterDevices,
            DirentType.DT_BLK => (desiredTarget & SearchTarget.BlockDevices) == SearchTarget.BlockDevices,
            _ => false
        };
    }
}
}