Optimizing enumeration

This commit is contained in:
2021-04-13 12:14:55 +03:00
parent 1fab2e09e8
commit cf67f0e59a
3 changed files with 365 additions and 151 deletions

View File

@@ -116,6 +116,8 @@ namespace Files
var sameSize = connection.Query<UnixFileRecord>("SELECT name, size, inode FROM files WHERE size = @size",
new { potentialFile.size }).ToList();
sameSize.CalculateHashes();
var recordsWithErrors = sameSize
.Where(r => !r.Hash.HasValue);
@@ -218,15 +220,6 @@ namespace Files
private static async Task Main(string[] args)
{
/*foreach (var directoryName in UnixFileSystemEnumerator.EnumeratePaths("/home",
SearchTarget.DirectoriesAndFilesAndSymLinks,
true, true, default))
{
Console.WriteLine(directoryName);
}
;
return;*/
var verboseOption = new Option<bool>(new[] { "--verbose", "-v" }, "Verbose");
var hardlinkOption = new Option<bool>(new[] { "--hardlink", "-l" }, "Hardlink duplicates");
var databaseOption = new Option<bool>(new[] { "--keep", "-k" }, () => true, "Keep database.");
@@ -295,8 +288,15 @@ namespace Files
await ScanFiles(configuration, connection, ctx, ct);
}
FindDuplicates(configuration, connection, ctx, ct);
await transaction.CommitAsync(ct);
}
catch (OperationCanceledException)
{
await transaction.RollbackAsync();
AnsiConsole.WriteLine("Canceled!");
}
catch (Exception exception)
{
await transaction.RollbackAsync();
@@ -307,28 +307,85 @@ namespace Files
private static async Task ScanFiles(Configuration configuration, SqliteConnection sqliteConnection,
StatusContext statusContext, CancellationToken ct)
{
Func<string, Errno, bool> errorHandler = (path, errno) =>
UnixFileSystemEnumerator.FilterEnumeratorDelegate filter = (directory, entry, entryType, errno) => true;
var pathEnumerable = UnixFileSystemEnumerator.EnumeratePaths(
configuration.InitialDirectory.ToString(),
filter,
ct);
foreach ((string entryPath, byte entryType, Errno errno) in pathEnumerable)
{
if (errno != 0)
{
string errorDescription = UnixMarshal.GetErrorDescription(errno);
string safeErrorDescription = errorDescription
.Replace("[", "[[")
.Replace("]", "]]");
string safePath = path
string safePath = entryPath
.Replace("[", "[[")
.Replace("]", "]]");
AnsiConsole.MarkupLine($"[red]:cross_mark: {safeErrorDescription}:[/] :file_folder: {safePath}");
return false;
};
var pathEnumerable = UnixFileSystemEnumerator.EnumeratePaths(
configuration.InitialDirectory.ToString(),
SearchTarget.DirectoriesAndFiles,
true,
errorHandler,
ct);
foreach (string entryPath in pathEnumerable)
continue;
}
string entryTypeEmoji = entryType switch
{
if(!UnixFileSystemInfo.TryGetFileSystemEntry(entryPath, out var entry)) continue;
DirentType.DT_DIR => Emoji.Known.FileFolder,
DirentType.DT_REG => Emoji.Known.PageFacingUp,
DirentType.DT_LNK => Emoji.Known.Link,
DirentType.DT_BLK => Emoji.Known.ComputerDisk,
DirentType.DT_CHR => Emoji.Known.Keyboard,
DirentType.DT_FIFO => Emoji.Known.PButton,
DirentType.DT_SOCK => Emoji.Known.ElectricPlug,
DirentType.DT_UNKNOWN => Emoji.Known.Potato,
_ => Emoji.Known.PileOfPoo,
};
if (!UnixFileSystemEnumerator.IsOfTarget(entryType, SearchTarget.DirectoriesAndFiles))
{
if(!configuration.BeVerbose) continue;
string safePath = entryPath
.Replace("[", "[[")
.Replace("]", "]]");
string fileType = entryType switch
{
DirentType.DT_DIR => "Directory",
DirentType.DT_REG => "Regular file",
DirentType.DT_LNK => "Symbolic link",
DirentType.DT_BLK => "Block device",
DirentType.DT_CHR => "Character device",
DirentType.DT_FIFO => "Named pipe",
DirentType.DT_SOCK => "Socket",
DirentType.DT_UNKNOWN => "UNKNOWN",
_ => "WRONG",
};
AnsiConsole.MarkupLine($"[yellow]{Emoji.Known.FastForwardButton} {fileType}:[/] {entryTypeEmoji} {safePath}");
continue;
}
UnixFileSystemInfo entry = entryType switch
{
DirentType.DT_REG => new UnixFileInfo(entryPath),
DirentType.DT_DIR => new UnixDirectoryInfo(entryPath),
DirentType.DT_LNK => new UnixSymbolicLinkInfo(entryPath),
_ => throw new FileLoadException($"Cannot scan {entryTypeEmoji} {entryPath}"),
};
if (!entry.GetValid())
{
string errorDescription = UnixMarshal.GetErrorDescription(Stdlib.GetLastError());
string safePath = entryPath
.Replace("[", "[[")
.Replace("]", "]]");
AnsiConsole.MarkupLine($"[red]:cross_mark: {errorDescription}:[/] {entryTypeEmoji} {safePath}");
continue;
}
string safeEntryPath = entryPath
.Replace("[", "[[")
@@ -342,7 +399,7 @@ namespace Files
await sqliteConnection.ExecuteAsync("INSERT OR REPLACE INTO files (name, size, inode) VALUES (@Name, @Size, @INode);", record);
if (configuration.BeVerbose)
AnsiConsole.MarkupLine($"[green]:check_mark: OK:[/] :page_facing_up: {safeEntryPath}");
AnsiConsole.MarkupLine($"[green]:check_mark: OK:[/] {entryTypeEmoji} {safeEntryPath}");
}
else if (entry.GetType() == typeof(UnixDirectoryInfo)) // Faster than "is"
{
@@ -358,6 +415,110 @@ namespace Files
return;
}
}
/// <summary>
/// Reports groups of files with identical content and, when linking is enabled,
/// replaces every duplicate with a hard link to the group's head file.
/// </summary>
/// <remarks>
/// Candidates are narrowed by size in the database first (the query excludes zero-length files and
/// sizes that occur only once), then grouped by <c>Hash</c>. Records without a hash are reported
/// as NO_ACCESS and left out of the grouping.
/// </remarks>
/// <param name="configuration">Supplies <c>EnableLinking</c> and <c>BeVerbose</c>.</param>
/// <param name="connection">Connection used to query the <c>files</c> table.</param>
/// <param name="ctx">Status context whose text and spinner are updated.</param>
/// <param name="ct">Checked before every size bucket and every hash group.</param>
/// <exception cref="OperationCanceledException">Cancellation was requested through <paramref name="ct"/>.</exception>
private static void FindDuplicates(Configuration configuration,
    SqliteConnection connection, StatusContext ctx,
    CancellationToken ct)
{
    // Square brackets delimit console markup, so they are doubled before a path is printed.
    static string Escape(string text) => text.Replace("[", "[[").Replace("]", "]]");

    // Swaps `duplicate` for a hard link to `original`. The old content is parked under a
    // temporary name until the link exists, so a failure can put it back.
    static void ReplaceWithHardLink(UnixFileRecord original, UnixFileRecord duplicate)
    {
        string duplicatePath = duplicate.FileInfo.FullName;
        string tempFileName = duplicatePath + ".to_hardlink";

        // First rename
        File.Move(duplicatePath, tempFileName);
        try
        {
            // Then hardlink
            original.FileInfo.CreateLink(duplicatePath);
            // Then delete
            File.Delete(tempFileName);
        }
        catch (Exception)
        {
            // Restore the parked file. The link may already have been created at duplicatePath,
            // so the move has to overwrite it; a plain move would throw here and hide the
            // original failure while leaving the temporary file behind.
            if (File.Exists(tempFileName))
            {
                File.Move(tempFileName, duplicatePath, true);
            }

            throw;
        }
    }

    ctx.Status("Finding duplicates...");
    ctx.Spinner(Spinner.Known.Aesthetic);

    var potential = connection.Query<(int cnt, long size)>(
        "SELECT COUNT(*) cnt, size FROM files WHERE size != 0 GROUP BY size HAVING cnt > 1 ORDER BY size * cnt DESC;");

    foreach (var potentialFile in potential)
    {
        ct.ThrowIfCancellationRequested();

        var sameSize = connection.Query<UnixFileRecord>("SELECT name, size, inode FROM files WHERE size = @size",
            new { potentialFile.size }).ToList();

        var recordsWithErrors = sameSize
            .Where(r => !r.Hash.HasValue);
        foreach (var dbRecord in recordsWithErrors)
        {
            AnsiConsole.MarkupLine(
                $"[red]:cross_mark: NO_ACCESS:[/] :page_facing_up: {Escape(dbRecord.Name)}");
        }

        var equalGrouped = sameSize
            .Where(r => r.Hash.HasValue)
            .GroupBy(r => r.Hash)
            .Where(g => g.Count() > 1)
            .ToList();

        foreach (var grp in equalGrouped)
        {
            ct.ThrowIfCancellationRequested();

            // The record with the most links becomes the head that every other record is compared to.
            var records = grp.OrderByDescending(r => r.FileInfo.LinkCount).ToList();
            UnixFileRecord head = records.First();

            // tail: records on a different inode than head (candidates for linking).
            // tailWithDuplicates: records that already share head's inode.
            var tail = records.Skip(1).Where(r => r.INode != head.INode).ToList();
            var tailWithDuplicates = records.Skip(1).Where(r => r.INode == head.INode).ToList();

            ByteSize totalSize = records.Distinct(new DbRecordEqualityComparerByINode()).Sum(a => a.Size) - head.Size;

            var root = new Tree((head.Size + totalSize).ToStringWithDecimalPrefixedShortUnitName() + " total.");
            root.AddNode(((ByteSize)head.Size).ToStringWithDecimalPrefixedShortUnitName() + " " +
                         Escape(head.Name));

            foreach (var item in tail)
            {
                if (!configuration.EnableLinking)
                {
                    root.AddNode(((ByteSize)item.Size).ToStringWithDecimalPrefixedShortUnitName() + " " +
                                 Escape(item.Name));
                    continue;
                }

                try
                {
                    ReplaceWithHardLink(head, item);
                    root.AddNode("[green]:check_mark:[/] " + Escape(item.Name));
                }
                catch (Exception exception)
                {
                    // A failed link is reported and marked in the tree; remaining items are still processed.
                    AnsiConsole.WriteException(exception, ExceptionFormats.ShortenEverything);
                    root.AddNode("[red]:cross_mark:[/] " + Escape(item.Name));
                }
            }

            if (configuration.BeVerbose)
            {
                foreach (var duplicate in tailWithDuplicates)
                {
                    root.AddNode("[white]:anchor:[/] 0B " + Escape(duplicate.Name));
                }
            }

            // Groups with nothing on a different inode are shown only in verbose mode.
            if (tail.Count > 0 || configuration.BeVerbose)
            {
                AnsiConsole.Render(root);
                AnsiConsole.WriteLine();
            }
        }
    }
}
}
class Configuration
@@ -386,22 +547,35 @@ namespace Files
/// <summary>
/// Extension methods that read non-public members of <see cref="UnixFileSystemInfo"/> through
/// delegates compiled once in the static constructor.
/// </summary>
static class OriginalPathUnixFileSystemInfo
{
    // NOTE(review): assumes BlocksAllocated is expressed in 512-byte units — confirm against Mono.Unix.
    private const long BytesPerBlock = 512;

    private static readonly Func<UnixFileSystemInfo, string> GetOriginalPathFunc;
    private static readonly Func<UnixFileSystemInfo, bool> GetValidFunc;

    /// <summary>
    /// Builds the accessor delegates for the <c>OriginalPath</c> property and the <c>valid</c> field.
    /// </summary>
    /// <exception cref="MissingMemberException">
    /// The expected member does not exist on <see cref="UnixFileSystemInfo"/>; this reaches callers
    /// wrapped in a <see cref="TypeInitializationException"/>.
    /// </exception>
    static OriginalPathUnixFileSystemInfo()
    {
        Type owner = typeof(UnixFileSystemInfo);

        // Resolve both members up front so a missing one is reported by name instead of
        // failing later with a null dereference inside the IL emitter.
        MethodInfo originalPathGetter = owner
            .GetProperty("OriginalPath", BindingFlags.Instance | BindingFlags.Public | BindingFlags.NonPublic)
            ?.GetGetMethod(true)
            ?? throw new MissingMemberException(owner.FullName, "OriginalPath");
        FieldInfo validField = owner
            .GetField("valid", BindingFlags.Instance | BindingFlags.NonPublic)
            ?? throw new MissingMemberException(owner.FullName, "valid");

        // The dynamic methods are owned by UnixFileSystemInfo and skip visibility checks,
        // which is what lets them reach the non-public members.
        var method = new DynamicMethod("cheat", typeof(string), new[] { owner }, owner, true);
        var il = method.GetILGenerator();
        il.Emit(OpCodes.Ldarg_0);
        // No cast is emitted: the argument is already typed as UnixFileSystemInfo.
        il.Emit(OpCodes.Callvirt, originalPathGetter);
        il.Emit(OpCodes.Ret);
        GetOriginalPathFunc = (Func<UnixFileSystemInfo, string>)method.CreateDelegate(typeof(Func<UnixFileSystemInfo, string>));

        var method2 = new DynamicMethod("cheat2", typeof(bool), new[] { owner }, owner, true);
        var il2 = method2.GetILGenerator();
        il2.Emit(OpCodes.Ldarg_0);
        il2.Emit(OpCodes.Ldfld, validField);
        il2.Emit(OpCodes.Ret);
        GetValidFunc = (Func<UnixFileSystemInfo, bool>)method2.CreateDelegate(typeof(Func<UnixFileSystemInfo, bool>));
    }

    /// <summary>Returns the value of the <c>OriginalPath</c> property of <paramref name="info"/>.</summary>
    public static string GetOriginalPath(this UnixFileSystemInfo info) => GetOriginalPathFunc(info);

    /// <summary>Returns the current value of the non-public <c>valid</c> field of <paramref name="info"/>.</summary>
    public static bool GetValid(this UnixFileSystemInfo info) => GetValidFunc(info);

    /// <summary>Returns <c>BlocksAllocated</c> of <paramref name="info"/> multiplied by 512.</summary>
    public static long GetSizeOnDisk(this UnixFileSystemInfo info) => info.BlocksAllocated * BytesPerBlock;
}
}

View File

@@ -1,6 +1,8 @@
using System;
using System.Buffers;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Threading;
using System.Threading.Tasks;
@@ -8,10 +10,28 @@ using Mono.Unix;
namespace Files
{
/// <summary>
/// Bulk helpers for sequences of <see cref="UnixFileRecord"/>.
/// </summary>
public static class UnixFileRecordExtensions
{
    /// <summary>
    /// Hashes one file per inode and stores that hash on every record sharing the inode.
    /// </summary>
    /// <remarks>
    /// The first record of each inode group is the one whose file is hashed. When that hash
    /// cannot be obtained, the records of that group are left untouched.
    /// </remarks>
    /// <param name="records">Records to process; grouped by <c>INode</c>.</param>
    public static void CalculateHashes(this IEnumerable<UnixFileRecord> records)
    {
        var groupedByInode = records.GroupBy(record => record.INode);

        foreach (var sameInode in groupedByInode)
        {
            UnixFileRecord representative = sameInode.First();
            Guid? sharedHash = UnixFileRecord.GetHash(representative.FileInfo);

            if (sharedHash.HasValue)
            {
                Guid value = sharedHash.Value;
                foreach (UnixFileRecord member in sameInode)
                {
                    member.SetHash(value);
                }
            }
        }
    }
}
public class UnixFileRecord
{
private readonly Lazy<Guid?> _guid;
private readonly Lazy<UnixFileInfo> _fileInfo;
private Guid? _preCalculatedHash = null;
public UnixFileRecord()
{
@@ -53,9 +73,11 @@ namespace Files
public Guid? Hash => _guid.Value;
public UnixFileInfo FileInfo => _fileInfo.Value;
public void SetHash(Guid hash) => _preCalculatedHash = hash;
private UnixFileInfo GetFileInfo() => new(Name);
private Guid? GetHash() => GetHash(Name);
private Guid? GetHash() => _preCalculatedHash ??= GetHash(Name);
private async Task<Guid?> GetHashAsync(CancellationToken ct = default) => await GetHashAsync(Name, ct);
@@ -63,7 +85,7 @@ namespace Files
private async Task<Guid?> GetHash2Async(CancellationToken ct = default) => await GetHash2Async(Name, ct);
private static Guid? GetHash(string filePath)
public static Guid? GetHash(string filePath)
{
try
{
@@ -78,7 +100,22 @@ namespace Files
}
}
private static async Task<Guid?> GetHashAsync(string filePath, CancellationToken ct = default)
/// <summary>
/// Computes the MD5 digest of a file's content and returns it packed into a <see cref="Guid"/>.
/// </summary>
/// <param name="file">File to hash.</param>
/// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> when anything goes wrong while opening or reading the file.</returns>
public static Guid? GetHash(UnixFileInfo file)
{
    try
    {
        // Hashing only reads, so open read-only; asking for write access as well
        // would make files the current user cannot modify unhashable.
        using UnixStream stream = file.OpenRead();
        using MD5 md5 = MD5.Create();
        byte[] bytes = md5.ComputeHash(stream);
        return new Guid(bytes);
    }
    catch
    {
        // Best effort by design: callers treat a missing hash as "could not access".
        return null;
    }
}
public static async Task<Guid?> GetHashAsync(string filePath, CancellationToken ct = default)
{
try
{
@@ -93,7 +130,22 @@ namespace Files
}
}
private static Guid? GetHash2(string filePath, CancellationToken ct = default)
/// <summary>
/// Asynchronously computes the MD5 digest of a file's content and returns it packed into a <see cref="Guid"/>.
/// </summary>
/// <param name="file">File to hash.</param>
/// <param name="ct">Token passed to the hash computation.</param>
/// <returns>The digest as a <see cref="Guid"/>, or <c>null</c> when anything goes wrong while opening or reading the file.</returns>
public static async Task<Guid?> GetHashAsync(UnixFileInfo file, CancellationToken ct = default)
{
    try
    {
        // Hashing only reads, so open read-only; asking for write access as well
        // would make files the current user cannot modify unhashable.
        await using UnixStream stream = file.OpenRead();
        using MD5 md5 = MD5.Create();
        byte[] bytes = await md5.ComputeHashAsync(stream, ct);
        return new Guid(bytes);
    }
    catch
    {
        // Best effort by design: callers treat a missing hash as "could not access".
        // NOTE(review): this also turns a cancellation into null — confirm callers re-check the token.
        return null;
    }
}
public static Guid? GetHash2(string filePath, CancellationToken ct = default)
{
using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
using FileStream inputStream = File.OpenRead(filePath);
@@ -131,7 +183,7 @@ namespace Files
}
}
private static async Task<Guid?> GetHash2Async(string filePath, CancellationToken ct = default)
public static async Task<Guid?> GetHash2Async(string filePath, CancellationToken ct = default)
{
using IncrementalHash incrementalHash = IncrementalHash.CreateHash(HashAlgorithmName.MD5);
await using FileStream inputStream = File.OpenRead(filePath);

View File

@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Threading;
using Mono.Unix;
@@ -52,7 +51,7 @@ namespace Files
}
/* File types for `d_type'. */
internal static class DirentType
public static class DirentType
{
/// <summary>
/// The type is unknown. Only some filesystems have full support to return the type of the file, others might always return this value.
@@ -93,34 +92,23 @@ namespace Files
/// A local-domain socket.
/// </summary>
public const byte DT_SOCK = 12;
};
}
public static class UnixFileSystemEnumerator
{
private const string RelativeCurrentDir = ".";
private const string RelativeParentDir = "..";
public static IEnumerable<string> EnumeratePaths(
string path,
SearchTarget searchTarget,
bool recurseSubdirectories,
bool ignoreInaccessible,
CancellationToken ct)
public static bool TryGetEntries(string directoryPath, out List<Dirent> list, out Errno error)
{
if (recurseSubdirectories)
list = new List<Dirent>();
IntPtr dirPointer = Syscall.opendir(directoryPath);
if (dirPointer == IntPtr.Zero)
{
return ignoreInaccessible
? InternalRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct)
: InternalRecurseEnumeratePathsWithExceptions(path, searchTarget, ct);
error = Stdlib.GetLastError();
return false;
}
return ignoreInaccessible
? InternalNoRecurseEnumeratePathsWithoutExceptions(path, searchTarget, ct)
: InternalNoRecurseEnumeratePathsWithExceptions(path, searchTarget, ct);
}
private static IEnumerable<Dirent> GetEntries(IntPtr dirPointer, bool shouldThrow, CancellationToken ct)
{
IntPtr result;
int returnValue;
do
@@ -128,120 +116,63 @@ namespace Files
Dirent entry = new();
returnValue = Syscall.readdir_r(dirPointer, entry, out result);
if (ct.IsCancellationRequested) break;
if (returnValue == 0
&& result != IntPtr.Zero
&& (entry.d_name != RelativeCurrentDir && entry.d_name != RelativeParentDir))
yield return entry;
if (returnValue == 0 && result != IntPtr.Zero && (entry.d_name != RelativeCurrentDir && entry.d_name != RelativeParentDir))
{
list.Add(entry);
}
}
while (returnValue == 0 && result != IntPtr.Zero);
if (returnValue != 0 && shouldThrow)
UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
if (returnValue == 0)
{
error = 0;
return true;
}
private static IEnumerable<Dirent> GetEntriesWithException(string directoryPath, CancellationToken ct)
{
IntPtr num = Syscall.opendir(directoryPath);
if (num == IntPtr.Zero)
UnixMarshal.ThrowExceptionForLastError();
bool flag = false;
try
{
var entries = GetEntries(num, true, ct).ToList();
flag = true;
return entries;
}
finally
{
int returnValue = Syscall.closedir(num);
if (flag)
UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
}
error = Stdlib.GetLastError();
return false;
}
private static IEnumerable<Dirent> GetEntriesNoException(string directoryPath, CancellationToken ct)
{
IntPtr num = Syscall.opendir(directoryPath);
if (num == IntPtr.Zero)
yield break;
public delegate bool FilterEnumeratorDelegate(string directory, string directoryEntry, byte entryType,
Errno errno);
foreach (Dirent directoryEntry in GetEntries(num, false, ct))
public static IEnumerable<(string path, byte type, Errno errno)> EnumeratePaths(
string path, FilterEnumeratorDelegate filter = null, CancellationToken ct = default)
{
yield return directoryEntry;
LinkedList<string> directoriesStack = new LinkedList<string>();
directoriesStack.AddLast(path);
while (directoriesStack.Last != null)
{
string dir = directoriesStack.Last.ValueRef;
directoriesStack.RemoveLast();
if (!TryGetEntries(dir, out List<Dirent> entries, out Errno errno))
{
if(!(filter?.Invoke(dir, ".", DirentType.DT_DIR, errno) ?? true)) yield break;
yield return (dir, DirentType.DT_DIR, errno);
}
int returnValue = Syscall.closedir(num);
if (returnValue != 0)
UnixMarshal.ThrowExceptionForLastErrorIf(returnValue);
}
foreach (Dirent entry in entries)
{
ct.ThrowIfCancellationRequested();
private static IEnumerable<string> InternalNoRecurseEnumeratePathsWithExceptions(
string path, SearchTarget searchTarget, CancellationToken ct)
{
foreach (Dirent entry in GetEntriesWithException(path, ct))
{
if (IsTarget(entry, searchTarget))
yield return entry.d_name;
}
}
if (!(filter?.Invoke(dir, entry.d_name, entry.d_type, 0) ?? true)) continue;
private static IEnumerable<string> InternalNoRecurseEnumeratePathsWithoutExceptions(
string path, SearchTarget searchTarget, CancellationToken ct)
{
foreach (Dirent entry in GetEntriesNoException(path, ct))
{
if (IsTarget(entry, searchTarget))
yield return entry.d_name;
}
}
private static IEnumerable<string> InternalRecurseEnumeratePathsWithExceptions(
string path, SearchTarget searchTarget, CancellationToken ct)
{
Stack<string> directoriesStack = new Stack<string>();
directoriesStack.Push(path);
while (directoriesStack.TryPop(out string dir))
{
foreach (Dirent entry in GetEntriesWithException(dir, ct))
{
string combinedPath = Path.Combine(dir, entry.d_name);
if (entry.d_type == 4) // Directory
if (entry.d_type == DirentType.DT_DIR) // Directory
{
directoriesStack.Push(combinedPath);
directoriesStack.AddLast(combinedPath);
}
if (IsTarget(entry, searchTarget)) yield return combinedPath;
}
}
}
private static IEnumerable<string> InternalRecurseEnumeratePathsWithoutExceptions(
string path, SearchTarget searchTarget, CancellationToken ct)
{
Stack<string> directoriesStack = new Stack<string>();
directoriesStack.Push(path);
while (directoriesStack.TryPop(out string dir))
{
foreach (Dirent entry in GetEntriesNoException(dir, ct))
{
string combinedPath = Path.Combine(dir, entry.d_name);
if (entry.d_type == 4) // Directory
{
directoriesStack.Push(combinedPath);
}
if (IsTarget(entry, searchTarget)) yield return combinedPath;
yield return (combinedPath, entry.d_type, 0);
}
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool IsTarget(Dirent entry, SearchTarget desiredTarget)
public static bool IsOfTarget(this Dirent entry, SearchTarget desiredTarget)
{
return entry.d_type switch
{
@@ -255,5 +186,62 @@ namespace Files
_ => false
};
}
/// <summary>
/// Tells whether a directory-entry type code is selected by a <see cref="SearchTarget"/> mask.
/// </summary>
/// <param name="entryType">One of the <see cref="DirentType"/> constants.</param>
/// <param name="desiredTarget">Mask of entry kinds the caller is interested in.</param>
/// <returns>
/// <c>true</c> when the flag matching <paramref name="entryType"/> is fully set in
/// <paramref name="desiredTarget"/>; <c>false</c> for any type code without a matching flag.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsOfTarget(byte entryType, SearchTarget desiredTarget)
{
    // Map the type code to the flag it requires, then test that flag once.
    SearchTarget required;
    switch (entryType)
    {
        case DirentType.DT_DIR:
            required = SearchTarget.Directories;
            break;
        case DirentType.DT_REG:
            required = SearchTarget.Files;
            break;
        case DirentType.DT_LNK:
            required = SearchTarget.SymLinks;
            break;
        case DirentType.DT_FIFO:
            required = SearchTarget.NamedPipes;
            break;
        case DirentType.DT_SOCK:
            required = SearchTarget.Sockets;
            break;
        case DirentType.DT_CHR:
            required = SearchTarget.CharacterDevices;
            break;
        case DirentType.DT_BLK:
            required = SearchTarget.BlockDevices;
            break;
        default:
            return false;
    }

    return (desiredTarget & required) == required;
}
/// <summary>
/// Creates the exception that corresponds to an <see cref="Errno"/> value.
/// </summary>
/// <remarks>
/// The exception is created, not thrown. Its message is the description returned by
/// <c>UnixMarshal.GetErrorDescription</c>, and a <see cref="UnixIOException"/> built from the same
/// errno is attached as the inner exception. Values without a dedicated mapping yield that
/// <see cref="UnixIOException"/> itself.
/// </remarks>
/// <param name="errno">Error code to translate.</param>
/// <returns>An exception describing <paramref name="errno"/>.</returns>
public static Exception CreateExceptionForError(this Errno errno)
{
    string errorDescription = UnixMarshal.GetErrorDescription(errno);
    UnixIOException unixIoException = new(errno);
    switch (errno)
    {
        case Errno.EPERM:
        case Errno.EOPNOTSUPP:
            return new InvalidOperationException(errorDescription, unixIoException);
        case Errno.ENOENT:
            return new FileNotFoundException(errorDescription, unixIoException);
        case Errno.EIO:
        case Errno.ENXIO:
        case Errno.ENOSPC:
        case Errno.ESPIPE:
        case Errno.EROFS:
        case Errno.ENOTEMPTY:
            return new IOException(errorDescription, unixIoException);
        case Errno.ENOEXEC:
            return new InvalidProgramException(errorDescription, unixIoException);
        case Errno.EBADF:
        case Errno.EINVAL:
            return new ArgumentException(errorDescription, unixIoException);
        case Errno.EACCES:
        case Errno.EISDIR:
            return new UnauthorizedAccessException(errorDescription, unixIoException);
        case Errno.EFAULT:
            return new NullReferenceException(errorDescription, unixIoException);
        case Errno.ENOTDIR:
            return new DirectoryNotFoundException(errorDescription, unixIoException);
        case Errno.ERANGE:
            // The single-string constructor takes a parameter name, not a message, so the
            // (message, innerException) overload is used to keep the description and the errno.
            return new ArgumentOutOfRangeException(errorDescription, unixIoException);
        case Errno.ENAMETOOLONG:
            return new PathTooLongException(errorDescription, unixIoException);
        case Errno.EOVERFLOW:
            return new OverflowException(errorDescription, unixIoException);
        default:
            return unixIoException;
    }
}
}
}