Skip to content

Commit

Permalink
Double encoding detection #471 #600 #769 #954
Browse files Browse the repository at this point in the history
  • Loading branch information
emako committed Dec 24, 2024
1 parent 596ff6d commit 3680770
Showing 1 changed file with 57 additions and 2 deletions.
59 changes: 57 additions & 2 deletions QuickLook.Plugin/QuickLook.Plugin.TextViewer/TextViewerPanel.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
using QuickLook.Common.Plugin;
using System;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading.Tasks;
Expand Down Expand Up @@ -177,8 +178,8 @@ private void LoadFileAsync(string path)
var bufferCopy = buffer.ToArray();
buffer.Dispose();

var encoding = CharsetDetector.DetectFromBytes(bufferCopy).Detected?.Encoding ??
Encoding.Default;
var result = CharsetDetector.DetectFromBytes(bufferCopy);
var encoding = result.DoubleDetectFromResult(bufferCopy); // Fix issues

var doc = new TextDocument(encoding.GetString(bufferCopy));
doc.SetOwnerThread(Dispatcher.Thread);
Expand All @@ -199,3 +200,57 @@ private void LoadFileAsync(string path)
});
}
}

/// <summary>
/// Post-processing helpers that sanity-check the charset detector's verdict
/// before the bytes are decoded for display.
/// </summary>
file static class DetectionExtensions
{
    /// <summary>
    /// Buffers of at most this many bytes are treated as "too short" for the
    /// detector's answer to be taken at face value.
    /// </summary>
    private const int ShortBufferThreshold = 50;

    /// <summary>
    /// Encodings the viewer prefers to fall back to, in priority order:
    /// UTF-8, then UTF-32, then the system ANSI code page.
    /// </summary>
    private static readonly Encoding[] PreferredEncodings =
    {
        Encoding.UTF8,
        Encoding.UTF32,
        Encoding.Default,
    };

    /// <summary>
    /// Re-evaluates a charset detection result and returns the encoding that
    /// should be used to decode <paramref name="buffer"/>.
    /// </summary>
    /// <param name="result">Result produced by the charset detector for <paramref name="buffer"/>.</param>
    /// <param name="buffer">The raw bytes that were analysed.</param>
    /// <returns>
    /// The detected encoding when it is backed by a byte order mark or is already
    /// one of the preferred encodings; otherwise the first preferred encoding that
    /// passes the fallback checks, or the detected encoding if none does.
    /// </returns>
    public static Encoding DoubleDetectFromResult(this DetectionResult result, byte[] buffer)
    {
        // Determine the highest confidence encoding, or fallback to ANSI
        var encoding = result.Detected?.Encoding ?? Encoding.Default;

        // A byte order mark matching the detected encoding is explicit evidence,
        // so none of the heuristics below should override it. Without this guard
        // a short BOM-marked file (e.g. UTF-16) fails the UTF-8 / UTF-32 checks
        // and ends up decoded as ANSI.
        if (StartsWithPreamble(buffer, encoding))
        {
            return encoding;
        }

        // When mixing encodings, one of the encodings may gain higher confidence
        // In this case, we should return to encodings UTF8 / UTF32 / ANSI
        // https://github.com/QL-Win/QuickLook/issues/769
        if (!IsPreferred(encoding))
        {
            var reported = PreferredEncodings.FirstOrDefault(candidate =>
                result.Details.Any(detail => HasSameCodePage(detail.Encoding, candidate)));

            if (reported != null)
            {
                encoding = reported;
            }
        }

        // When the text is too short and lacks a BOM
        // In this case, we should fallback to an encoding if it is not recognized as UTF8 / UTF32 / ANSI
        // https://github.com/QL-Win/QuickLook/issues/471
        // https://github.com/QL-Win/QuickLook/issues/600
        // https://github.com/QL-Win/QuickLook/issues/954
        if (buffer.Length <= ShortBufferThreshold && !IsPreferred(encoding))
        {
            // NOTE(review): ANSI code pages typically never produce U+FFFD, so the
            // last candidate is expected to act as a catch-all - confirm if relied upon.
            var decodable = PreferredEncodings.FirstOrDefault(candidate =>
                !candidate.GetString(buffer).Contains("\uFFFD"));

            if (decodable != null)
            {
                encoding = decodable;
            }
        }

        return encoding;
    }

    /// <summary>
    /// Returns true when <paramref name="encoding"/> is UTF-8, UTF-32 or the system ANSI code page.
    /// </summary>
    private static bool IsPreferred(Encoding encoding)
    {
        return PreferredEncodings.Any(candidate => HasSameCodePage(encoding, candidate));
    }

    /// <summary>
    /// Compares two encodings by code page. Encoding does not overload the
    /// equality operators, so == would only compare object references and could
    /// miss an equivalent instance obtained through Encoding.GetEncoding.
    /// </summary>
    private static bool HasSameCodePage(Encoding left, Encoding right)
    {
        return left != null && right != null && left.CodePage == right.CodePage;
    }

    /// <summary>
    /// Returns true when <paramref name="buffer"/> begins with the non-empty
    /// preamble (byte order mark) of <paramref name="encoding"/>.
    /// </summary>
    private static bool StartsWithPreamble(byte[] buffer, Encoding encoding)
    {
        var preamble = encoding.GetPreamble();

        if (preamble.Length == 0 || buffer.Length < preamble.Length)
        {
            return false;
        }

        return buffer.Take(preamble.Length).SequenceEqual(preamble);
    }
}

0 comments on commit 3680770

Please sign in to comment.