diff --git a/Analyzer.Tests/FileDetectionTests.cs b/Analyzer.Tests/FileDetectionTests.cs
new file mode 100644
index 0000000..3d8c8bf
--- /dev/null
+++ b/Analyzer.Tests/FileDetectionTests.cs
@@ -0,0 +1,302 @@
+using System;
+using System.IO;
+using NUnit.Framework;
+using UnityDataTools.Analyzer.Util;
+using UnityDataTools.FileSystem;
+
+namespace UnityDataTools.Analyzer.Tests;
+
+/// <summary>
+/// Tests for the file format detection utilities (ArchiveDetector, SerializedFileDetector and YamlSerializedFileDetector).
+/// </summary>
+public class FileDetectionTests
+{
+    private string m_TestDataPath;
+
+    [OneTimeSetUp]
+    public void OneTimeSetUp()
+    {
+        m_TestDataPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data");
+        UnityFileSystem.Init();
+    }
+
+    [OneTimeTearDown]
+    public void OneTimeTearDown()
+    {
+        UnityFileSystem.Cleanup();
+    }
+
+    #region SerializedFile Detection Tests
+
+    [Test]
+    public void TryDetectSerializedFile_ValidPlayerDataFile_ReturnsTrue()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "PlayerData", "2022.1.20f1", "level0");
+
+        bool detected = SerializedFileDetector.TryDetectSerializedFile(testFile, out var info);
+
+        Assert.That(detected, Is.True, "level0 should be detected as a valid SerializedFile");
+        Assert.That(info, Is.Not.Null);
+
+        // The expected numbers below are the exact header fields of the level0 fixture
+        Assert.That(info.Version, Is.EqualTo(22u), "Version should be 22");
+        Assert.That(info.FileSize, Is.EqualTo(31988UL), "FileSize should be 31988");
+        Assert.That(info.MetadataSize, Is.EqualTo(24580UL), "MetadataSize should be 24580");
+        Assert.That(info.DataOffset, Is.EqualTo(24640UL), "DataOffset should be 24640");
+        Assert.That(info.Endianness, Is.EqualTo((byte)0), "Endianness should be 0 (LittleEndian)");
+        Assert.That(info.IsLegacyFormat, Is.False, "Version 22 uses modern format (64-bit header)");
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_SerializedFileInsideArchive_ReturnsTrue()
+    {
+        // The fixture is a serialized file that was extracted from the alienprefab archive,
+        // where it was stored under the name CAB-c5053efeda8860d7e7b7ce4b4c66705b
+        var testFile = Path.Combine(m_TestDataPath, "LegacyFormats", "CAB-c5053efeda8860d7e7b7ce4b4c66705b");
+
+        bool detected = SerializedFileDetector.TryDetectSerializedFile(testFile, out var info);
+
+        Assert.That(detected, Is.True, "CAB-c5053efeda8860d7e7b7ce4b4c66705b should be detected as a valid SerializedFile");
+        Assert.That(info, Is.Not.Null);
+
+        // The expected numbers below are the exact header fields of the CAB fixture
+        Assert.That(info.Version, Is.EqualTo(17u), "Version should be 17");
+        Assert.That(info.FileSize, Is.EqualTo(595380UL), "FileSize should be 595380");
+        Assert.That(info.MetadataSize, Is.EqualTo(61328UL), "MetadataSize should be 61328");
+        Assert.That(info.DataOffset, Is.EqualTo(61360UL), "DataOffset should be 61360");
+        Assert.That(info.Endianness, Is.EqualTo((byte)0), "Endianness should be 0 (LittleEndian)");
+        Assert.That(info.IsLegacyFormat, Is.True, "Version 17 uses legacy format (32-bit header)");
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_JsonFile_ReturnsFalse()
+    {
+        var testFiles = Directory.GetFiles(Path.Combine(m_TestDataPath, "AddressableBuildLayouts"), "*.json");
+        Assert.That(testFiles.Length, Is.GreaterThan(0), "Should have at least one JSON test file");
+
+        foreach (var testFile in testFiles)
+        {
+            bool detected = SerializedFileDetector.TryDetectSerializedFile(testFile, out var info);
+
+            Assert.That(detected, Is.False, $"{Path.GetFileName(testFile)} should not be detected as a SerializedFile");
+            Assert.That(info, Is.Null, "Info should be null for non-SerializedFile");
+        }
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_TextFile_ReturnsFalse()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "PlayerNoTypeTree", "README.md");
+
+        bool detected = SerializedFileDetector.TryDetectSerializedFile(testFile, out var info);
+
+        Assert.That(detected, Is.False, "README.md should not be detected as a SerializedFile");
+        Assert.That(info, Is.Null);
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_EmptyFile_ReturnsFalse()
+    {
+        // GetTempFileName creates a zero-byte file on disk
+        var tempFile = Path.GetTempFileName();
+        try
+        {
+            bool detected = SerializedFileDetector.TryDetectSerializedFile(tempFile, out var info);
+
+            Assert.That(detected, Is.False, "Empty file should not be detected as a SerializedFile");
+            Assert.That(info, Is.Null);
+        }
+        finally
+        {
+            File.Delete(tempFile);
+        }
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_TruncatedHeader_ReturnsFalse()
+    {
+        // A 10-byte file cannot hold even the smallest header
+        var tempFile = Path.GetTempFileName();
+        try
+        {
+            File.WriteAllBytes(tempFile, new byte[10]); // Less than minimum header size (20 bytes)
+
+            bool detected = SerializedFileDetector.TryDetectSerializedFile(tempFile, out var info);
+
+            Assert.That(detected, Is.False, "Truncated file should not be detected as a SerializedFile");
+            Assert.That(info, Is.Null);
+        }
+        finally
+        {
+            File.Delete(tempFile);
+        }
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_RandomBytes_ReturnsFalse()
+    {
+        // 100 pseudo-random bytes written to a temporary file
+        var tempFile = Path.GetTempFileName();
+        try
+        {
+            var random = new Random(12345); // Fixed seed for reproducibility
+            byte[] randomData = new byte[100];
+            random.NextBytes(randomData);
+            File.WriteAllBytes(tempFile, randomData);
+
+            bool detected = SerializedFileDetector.TryDetectSerializedFile(tempFile, out var info);
+
+            Assert.That(detected, Is.False, "Random bytes should not be detected as a SerializedFile");
+            Assert.That(info, Is.Null);
+        }
+        finally
+        {
+            File.Delete(tempFile);
+        }
+    }
+
+    [Test]
+    public void TryDetectSerializedFile_NonExistentFile_ReturnsFalse()
+    {
+        var nonExistentFile = Path.Combine(m_TestDataPath, "ThisFileDoesNotExist.xyz");
+
+        bool detected = SerializedFileDetector.TryDetectSerializedFile(nonExistentFile, out var info);
+
+        Assert.That(detected, Is.False, "Non-existent file should not be detected as a SerializedFile");
+        Assert.That(info, Is.Null);
+    }
+
+    #endregion
+
+    #region YAML SerializedFile Detection Tests
+
+    [Test]
+    public void IsYamlSerializedFile_ValidYamlAsset_ReturnsTrue()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "YamlFormat.asset");
+
+        bool isYaml = YamlSerializedFileDetector.IsYamlSerializedFile(testFile);
+
+        Assert.That(isYaml, Is.True, "YamlFormat.asset should be detected as a YAML SerializedFile");
+    }
+
+    [Test]
+    public void IsYamlSerializedFile_BinarySerializedFile_ReturnsFalse()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "PlayerData", "2022.1.20f1", "level0");
+
+        bool isYaml = YamlSerializedFileDetector.IsYamlSerializedFile(testFile);
+
+        Assert.That(isYaml, Is.False, "Binary SerializedFile should not be detected as YAML");
+    }
+
+    [Test]
+    public void IsYamlSerializedFile_Archive_ReturnsFalse()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "AssetBundles", "2022.1.20f1", "assetbundle");
+
+        bool isYaml = YamlSerializedFileDetector.IsYamlSerializedFile(testFile);
+
+        Assert.That(isYaml, Is.False, "AssetBundle should not be detected as YAML");
+    }
+
+    [Test]
+    public void IsYamlSerializedFile_JsonFile_ReturnsFalse()
+    {
+        var testFiles = Directory.GetFiles(Path.Combine(m_TestDataPath, "AddressableBuildLayouts"), "*.json");
+        Assert.That(testFiles.Length, Is.GreaterThan(0), "Should have at least one JSON test file");
+
+        foreach (var testFile in testFiles)
+        {
+            bool isYaml = YamlSerializedFileDetector.IsYamlSerializedFile(testFile);
+
+            Assert.That(isYaml, Is.False, $"JSON file should not be detected as YAML SerializedFile: {Path.GetFileName(testFile)}");
+        }
+    }
+
+    [Test]
+    public void IsYamlSerializedFile_NonExistentFile_ReturnsFalse()
+    {
+        var nonExistentFile = Path.Combine(m_TestDataPath, "ThisFileDoesNotExist.asset");
+
+        bool isYaml = YamlSerializedFileDetector.IsYamlSerializedFile(nonExistentFile);
+
+        Assert.That(isYaml, Is.False, "Non-existent file should not be detected as YAML");
+    }
+
+    #endregion
+
+    #region Archive Detection Tests
+
+    [Test]
+    public void IsUnityArchive_ValidAssetBundle_ReturnsTrue()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "AssetBundles", "2022.1.20f1", "assetbundle");
+
+        bool isArchive = ArchiveDetector.IsUnityArchive(testFile);
+
+        Assert.That(isArchive, Is.True, "assetbundle should be detected as a Unity Archive");
+    }
+
+    [Test]
+    public void IsUnityArchive_OldFormatArchive_ReturnsTrue()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "LegacyFormats", "alienprefab");
+
+        bool isArchive = ArchiveDetector.IsUnityArchive(testFile);
+
+        Assert.That(isArchive, Is.True, "alienprefab should be detected as a Unity Archive");
+    }
+
+    [Test]
+    public void IsUnityArchive_SerializedFile_ReturnsFalse()
+    {
+        var testFile = Path.Combine(m_TestDataPath, "PlayerData", "2022.1.20f1", "level0");
+
+        bool isArchive = ArchiveDetector.IsUnityArchive(testFile);
+
+        Assert.That(isArchive, Is.False, "level0 (SerializedFile) should not be detected as an archive");
+    }
+
+    [Test]
+    public void IsUnityArchive_JsonFile_ReturnsFalse()
+    {
+        var testFiles = Directory.GetFiles(Path.Combine(m_TestDataPath, "AddressableBuildLayouts"), "*.json");
+        Assert.That(testFiles.Length, Is.GreaterThan(0), "Should have at least one JSON test file");
+
+        foreach (var testFile in testFiles)
+        {
+            bool isArchive = ArchiveDetector.IsUnityArchive(testFile);
+
+            Assert.That(isArchive, Is.False, $"{Path.GetFileName(testFile)} should not be detected as an archive");
+        }
+    }
+
+    [Test]
+    public void IsUnityArchive_EmptyFile_ReturnsFalse()
+    {
+        var tempFile = Path.GetTempFileName();
+        try
+        {
+            bool isArchive = ArchiveDetector.IsUnityArchive(tempFile);
+
+            Assert.That(isArchive, Is.False, "Empty file should not be detected as an archive");
+        }
+        finally
+        {
+            File.Delete(tempFile);
+        }
+    }
+
+    [Test]
+    public void IsUnityArchive_NonExistentFile_ReturnsFalse()
+    {
+        var nonExistentFile = Path.Combine(m_TestDataPath, "ThisFileDoesNotExist.xyz");
+
+        bool isArchive = ArchiveDetector.IsUnityArchive(nonExistentFile);
+
+        Assert.That(isArchive, Is.False, "Non-existent file should not be detected as an archive");
+    }
+
+    #endregion
+}
diff --git a/Analyzer/SQLite/Parsers/SerializedFileParser.cs b/Analyzer/SQLite/Parsers/SerializedFileParser.cs
index 5aa487e..853f056 100644
--- a/Analyzer/SQLite/Parsers/SerializedFileParser.cs
+++
b/Analyzer/SQLite/Parsers/SerializedFileParser.cs @@ -4,6 +4,7 @@ using Microsoft.Data.Sqlite; using UnityDataTools.Analyzer.SQLite.Handlers; using UnityDataTools.Analyzer.SQLite.Writers; +using UnityDataTools.Analyzer.Util; using UnityDataTools.FileSystem; namespace UnityDataTools.Analyzer.SQLite.Parsers @@ -17,7 +18,14 @@ public class SerializedFileParser : ISQLiteFileParser public bool CanParse(string filename) { - return ShouldIgnoreFile(filename) == false; + // First check if the file is in the ignore list (by extension or filename) + if (ShouldIgnoreFile(filename)) + return false; + + // Then validate that it's actually a Unity file by checking its format + // This prevents ugly exceptions when processing non-Unity files + return ArchiveDetector.IsUnityArchive(filename) + || SerializedFileDetector.TryDetectSerializedFile(filename, out _); } @@ -40,12 +48,12 @@ public void Parse(string filename) bool ShouldIgnoreFile(string file) { - // Unfortunately there is no standard extension for AssetBundles, and SerializedFiles often have no extension at all. - // Also there is also no distinctive signature at the start of a SerializedFile to immediately recognize it based on its first bytes. - // This makes it difficult to use the "--search-pattern" argument to only pick those files. - - // Hence to reduce noise in UnityDataTool output we filter out files that we have a high confidence are - // NOT SerializedFiles or Unity Archives. + // Filter out common non-Unity files by extension or filename. + // This is a fast initial filter before we perform format detection. + // + // Note: AssetBundles have no standard extension, and SerializedFiles often have no extension at all. + // Format detection (via ArchiveDetector and SerializedFileDetector) is performed after this filter + // to definitively identify Unity files. 
string fileName = Path.GetFileName(file); string extension = Path.GetExtension(file); @@ -69,7 +77,7 @@ bool ShouldIgnoreFile(string file) void ProcessFile(string file, string rootDirectory) { - if (IsUnityArchive(file)) + if (ArchiveDetector.IsUnityArchive(file)) { bool archiveHadErrors = false; using (UnityArchive archive = UnityFileSystem.MountArchive(file, "archive:" + Path.DirectorySeparatorChar)) @@ -122,45 +130,12 @@ void ProcessFile(string file, string rootDirectory) } else { - // This isn't a Unity Archive file. Try to open it as a SerializedFile. - // Unfortunately there is no standard file extension, or clear signature at the start of the file, - // to test if it truly is a SerializedFile. So this will process files that are clearly not unity build files, - // and there is a chance for crashes and freezes if the parser misinterprets the file content. + // This isn't a Unity Archive file, so process it as a SerializedFile. + // Note: The file has already been validated in CanParse() via SerializedFileDetector, + // so we're confident it's a valid SerializedFile at this point. var relativePath = Path.GetRelativePath(rootDirectory, file); m_Writer.WriteSerializedFile(relativePath, file, Path.GetDirectoryName(file)); } } - - private static bool IsUnityArchive(string filePath) - { - // Check whether a file is a Unity Archive (AssetBundle) by looking for known signatures at the start of the file. 
- // "UnifyFS" is the current signature, but some older formats of the file are still supported - string[] signatures = { "UnityFS", "UnityWeb", "UnityRaw", "UnityArchive" }; - int maxLen = 12; // "UnityArchive".Length - byte[] buffer = new byte[maxLen]; - - using (var fs = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read)) - { - int read = fs.Read(buffer, 0, buffer.Length); - foreach (var sig in signatures) - { - if (read >= sig.Length) - { - bool match = true; - for (int i = 0; i < sig.Length; ++i) - { - if (buffer[i] != sig[i]) - { - match = false; - break; - } - } - if (match) - return true; - } - } - return false; - } - } } } diff --git a/Analyzer/Util/ArchiveDetector.cs b/Analyzer/Util/ArchiveDetector.cs new file mode 100644 index 0000000..7ce6914 --- /dev/null +++ b/Analyzer/Util/ArchiveDetector.cs @@ -0,0 +1,63 @@ +using System; +using System.IO; + +namespace UnityDataTools.Analyzer.Util; + +/// +/// Utility for detecting Unity Archive (AssetBundle) files by reading their signature. +/// +public static class ArchiveDetector +{ + private static readonly string[] Signatures = { "UnityFS", "UnityWeb", "UnityRaw", "UnityArchive" }; + private const int MaxSignatureLength = 12; // "UnityArchive".Length + + /// + /// Checks if a file is a Unity Archive (AssetBundle) by reading its signature. + /// Supports UnityFS, UnityWeb, UnityRaw, and UnityArchive formats. 
+    ///
+    /// <param name="filePath">Path to the file to check</param>
+    /// <returns>True if file appears to be a Unity Archive, false otherwise</returns>
+    public static bool IsUnityArchive(string filePath)
+    {
+        if (!File.Exists(filePath))
+            return false;
+
+        try
+        {
+            using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+
+            // Read the first bytes to check for known signatures
+            byte[] buffer = new byte[MaxSignatureLength];
+            int bytesRead = stream.Read(buffer, 0, buffer.Length); // NOTE(review): one Read call may return fewer bytes than requested; only bytesRead bytes are compared below
+
+            if (bytesRead < Signatures[0].Length) // "UnityFS" is the shortest at 7 bytes (relies on it being the first array entry)
+                return false;
+
+            // Check against all known archive signatures
+            foreach (var signature in Signatures)
+            {
+                if (bytesRead >= signature.Length)
+                {
+                    bool match = true;
+                    for (int i = 0; i < signature.Length; i++)
+                    {
+                        if (buffer[i] != signature[i]) // byte vs. char comparison; valid because every signature is plain ASCII
+                        {
+                            match = false;
+                            break;
+                        }
+                    }
+                    if (match)
+                        return true;
+                }
+            }
+
+            return false;
+        }
+        catch
+        {
+            // If we can't read the file, it's not a valid archive
+            return false;
+        }
+    }
+}
diff --git a/Analyzer/Util/SerializedFileDetector.cs b/Analyzer/Util/SerializedFileDetector.cs
new file mode 100644
index 0000000..a9ca991
--- /dev/null
+++ b/Analyzer/Util/SerializedFileDetector.cs
@@ -0,0 +1,325 @@
+using System;
+using System.IO;
+
+namespace UnityDataTools.Analyzer.Util;
+
+/// <summary>
+/// Information extracted from a Unity SerializedFile header.
+/// </summary>
+public class SerializedFileInfo
+{
+    public uint Version { get; set; } // m_Version: SerializedFile format version
+    public ulong FileSize { get; set; } // m_FileSize; ulong.MaxValue when a legacy header stores the "unknown" marker (uint.MaxValue)
+    public ulong MetadataSize { get; set; } // m_MetadataSize
+    public ulong DataOffset { get; set; } // m_DataOffset
+    public byte Endianness { get; set; } // 0 = little-endian, 1 = big-endian (byte order of the data section)
+    public bool IsLegacyFormat { get; set; } // true when Version < 22, i.e. the 20-byte header with 32-bit fields
+}
+
+///
+/// Utility for detecting Unity SerializedFile format by reading and validating the file header.
+///
+/// Unity SerializedFiles have evolved through several format versions:
+///
+/// Version < 9:
+/// - 20-byte header (SerializedFileHeader32) with 32-bit offsets/sizes
+/// - Layout: [header][data][metadata]
+/// - Endianness byte stored at END of file, just before metadata
+///
+/// Version 9-21:
+/// - 20-byte header (SerializedFileHeader32) with 32-bit offsets/sizes
+/// - Layout: [header][metadata][data]
+/// - Endianness byte at offset 16 in header
+/// - Limited to 4GB file sizes
+///
+/// Version >= 22 (kLargeFilesSupport):
+/// - 48-byte header (SerializedFileHeader) with 64-bit offsets/sizes
+/// - Layout: [header][metadata][data]
+/// - Endianness byte at offset 40 in header
+/// - Supports files larger than 4GB
+///
+/// Important: The header itself is always stored in big-endian format on disk,
+/// but the m_Endianness byte indicates the endianness of the actual data section.
+///
+public static class SerializedFileDetector
+{
+    // Version boundaries for format changes
+    // NOTE: Files older than NewLayoutVersion are so old that they are extremely unlikely to work with modern versions of Unity,
+    // we handle them just for the purpose of trying to report accurate information about the file.
+    private const uint NewLayoutVersion = 9; // Changed from [header][data][metadata] to [header][metadata][data]
+
+    private const uint LargeFilesSupportVersion = 22; // Changed to 64-bit header
+
+    // Reasonable version range for SerializedFiles
+    // Unity versions currently use values in the 20s-30s range
+    private const uint MinVersion = 1;
+    private const uint MaxVersion = 50;
+
+    // Endianness values (only little-endian is supported in Unity 2023+)
+    private const byte LittleEndian = 0;
+    private const byte BigEndian = 1;
+
+    // Header sizes
+    private const int LegacyHeaderSize = 20; // SerializedFileHeader32
+    private const int ModernHeaderSize = 48; // SerializedFileHeader
+
+    /// <summary>
+    /// Attempts to detect if a file is a Unity SerializedFile by reading and validating its header.
+    /// Returns false immediately if the file doesn't match the expected format.
+    /// </summary>
+    /// <param name="filePath">Path to the file to check</param>
+    /// <param name="info">If successful, contains header information; null whenever the method returns false</param>
+    /// <returns>True if file appears to be a valid SerializedFile, false otherwise</returns>
+    public static bool TryDetectSerializedFile(string filePath, out SerializedFileInfo info)
+    {
+        info = null;
+
+        if (!File.Exists(filePath))
+            return false;
+
+        try
+        {
+            using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+            long fileLength = stream.Length;
+
+            // Quick rejection: file must be at least large enough for the legacy header
+            if (fileLength < LegacyHeaderSize)
+                return false;
+
+            // Read enough bytes to cover a modern header (48 bytes)
+            // We'll determine which format to parse based on the version field
+            byte[] headerBytes = new byte[ModernHeaderSize];
+            int bytesRead = stream.Read(headerBytes, 0, headerBytes.Length); // NOTE(review): one Read call may return fewer bytes than requested; the bytesRead checks below then reject the file
+
+            if (bytesRead < LegacyHeaderSize)
+                return false;
+
+            // ============================================================
+            // STEP 1: Read version to determine header format
+            // ============================================================
+
+            // The version field is always at offset 8 in both header formats.
+            // The header itself is always stored in big-endian format on disk.
+            // On little-endian platforms (Windows, etc.), we need to swap the header fields.
+            //
+            // We try both interpretations to determine if swapping is needed:
+            uint versionLE = BitConverter.ToUInt32(headerBytes, 8); // read in the host's byte order; the "LE" name assumes a little-endian host
+            uint versionBE = SwapUInt32(versionLE);
+
+            // Determine which interpretation gives us a valid version number
+            uint version;
+            bool needsSwap; // Whether header fields need byte swapping (expected to be true when running on most modern systems, which are little-endian)
+
+            if (versionLE >= MinVersion && versionLE <= MaxVersion)
+            {
+                // Reading as little-endian gives valid version (header is in little-endian format)
+                version = versionLE;
+                needsSwap = false;
+            }
+            else if (versionBE >= MinVersion && versionBE <= MaxVersion)
+            {
+                // Reading as big-endian gives valid version (header is in big-endian format)
+                version = versionBE;
+                needsSwap = true;
+            }
+            else
+            {
+                // Neither interpretation gives a valid version
+                return false;
+            }
+
+            // Determine header format based on version
+            bool isLegacyFormat = version < LargeFilesSupportVersion;
+
+            // ============================================================
+            // STEP 2: Read endianness byte
+            // ============================================================
+            //
+            // The m_Endianness byte indicates the endianness of the DATA section
+            // (not the header, which is always big-endian on disk).
+            // Location depends on version:
+            // - Version < 9: At end of file (before metadata) - we skip reading it for detection
+            // - Version 9-21: At offset 16 in the 20-byte header
+            // - Version >= 22: At offset 40 in the 48-byte header
+            //
+            // The endianness byte is never swapped (it's a single byte).
+
+            byte endianness;
+
+            if (version < NewLayoutVersion)
+            {
+                // Version < 9: Endianness is at the end of the file
+                // For detection purposes, we infer it from the header byte order
+                // (though this is technically the header's endianness, not the data's)
+                endianness = needsSwap ? BigEndian : LittleEndian;
+            }
+            else if (isLegacyFormat)
+            {
+                // Version 9-21: Endianness is at offset 16 in SerializedFileHeader32
+                if (bytesRead >= 17)
+                {
+                    endianness = headerBytes[16];
+
+                    // Validate endianness value
+                    if (endianness != LittleEndian && endianness != BigEndian)
+                        return false;
+                }
+                else
+                {
+                    return false; // File truncated
+                }
+            }
+            else
+            {
+                // Version >= 22: Endianness is at offset 40 in SerializedFileHeader
+                if (bytesRead >= 41)
+                {
+                    endianness = headerBytes[40];
+
+                    // Validate endianness value
+                    if (endianness != LittleEndian && endianness != BigEndian)
+                        return false;
+                }
+                else
+                {
+                    return false; // File truncated
+                }
+            }
+
+            // ============================================================
+            // STEP 3: Parse the appropriate header format
+            // ============================================================
+
+            ulong metadataSize, fileSize, dataOffset;
+
+            if (isLegacyFormat)
+            {
+                // SerializedFileHeader32 Layout (20 bytes total):
+                // Offset 0-3: UInt32 m_MetadataSize
+                // Offset 4-7: UInt32 m_FileSize
+                // Offset 8-11: UInt32 m_Version
+                // Offset 12-15: UInt32 m_DataOffset
+                // Offset 16: UInt8 m_Endianness (only present for version >= 9)
+                // Offset 17-19: UInt8 m_Reserved[3]
+                //
+                // Note: For version < 9, m_Endianness is NOT in the header.
+                // It's stored at the end of the file, just before metadata begins.
+
+                uint metadataSize32 = ReadUInt32(headerBytes, 0, needsSwap);
+                uint fileSize32 = ReadUInt32(headerBytes, 4, needsSwap);
+                uint dataOffset32 = ReadUInt32(headerBytes, 12, needsSwap);
+
+                // Convert to 64-bit for consistency
+                metadataSize = metadataSize32; // NOTE(review): unlike fileSize below, uint.MaxValue is not promoted to ulong.MaxValue here
+                fileSize = fileSize32;
+                dataOffset = dataOffset32;
+
+                // Special case: Legacy format used UInt32.MaxValue to indicate "unknown" file size
+                if (fileSize32 == uint.MaxValue)
+                {
+                    fileSize = ulong.MaxValue;
+                }
+            }
+            else
+            {
+                // SerializedFileHeader Layout (48 bytes total):
+                // Offset 0-7: UInt8[8] m_Legacy (unused, allows struct alignment with SerializedFileHeader32)
+                // Offset 8-11: UInt32 m_Version
+                // Offset 12-15: UInt8[4] m_Reserved0 (explicit padding)
+                // Offset 16-23: UInt64 m_MetadataSize
+                // Offset 24-31: UInt64 m_FileSize
+                // Offset 32-39: UInt64 m_DataOffset
+                // Offset 40: UInt8 m_Endianness
+                // Offset 41-47: UInt8[7] m_Reserved1
+
+                metadataSize = ReadUInt64(headerBytes, 16, needsSwap);
+                fileSize = ReadUInt64(headerBytes, 24, needsSwap);
+                dataOffset = ReadUInt64(headerBytes, 32, needsSwap);
+            }
+
+            // ============================================================
+            // STEP 4: Validate header consistency
+            // ============================================================
+
+            // MetadataSize must not be the sentinel value (indicates corruption)
+            if (metadataSize == ulong.MaxValue)
+                return false;
+
+            // DataOffset must be within the file size (skipped when the header's file size is the "unknown" marker)
+            if (fileSize != ulong.MaxValue && dataOffset > fileSize)
+                return false;
+
+            // FileSize should roughly match actual file size
+            // Allow some tolerance for "stream files" which can have padding
+            if (fileSize != ulong.MaxValue)
+            {
+                // File size should not exceed actual file size by more than 1KB (arbitrary tolerance)
+                if (fileSize > (ulong)fileLength + 1024)
+                    return false;
+            }
+
+            // MetadataSize should be reasonable (not larger than the file itself)
+            if (metadataSize > (ulong)fileLength)
+                return false;
+
+            // ============================================================
+            // STEP 5: Populate and return info
+            // ============================================================
+
+            info = new SerializedFileInfo
+            {
+                Version = version,
+                FileSize = fileSize,
+                MetadataSize = metadataSize,
+                DataOffset = dataOffset,
+                Endianness = endianness,
+                IsLegacyFormat = isLegacyFormat
+            };
+
+            return true;
+        }
+        catch
+        {
+            // Any exception during reading/parsing means this isn't a valid SerializedFile
+            return false;
+        }
+    }
+
+    /// <summary>
+    /// Reads a UInt32 from a byte array at the specified offset, optionally swapping endianness.
+    /// </summary>
+    private static uint ReadUInt32(byte[] buffer, int offset, bool swap)
+    {
+        uint value = BitConverter.ToUInt32(buffer, offset);
+        return swap ? SwapUInt32(value) : value;
+    }
+
+    /// <summary>
+    /// Reads a UInt64 from a byte array at the specified offset, optionally swapping endianness.
+    /// </summary>
+    private static ulong ReadUInt64(byte[] buffer, int offset, bool swap)
+    {
+        ulong value = BitConverter.ToUInt64(buffer, offset);
+        return swap ? SwapUInt64(value) : value;
+    }
+
+    private static uint SwapUInt32(uint value) // reverses the order of the four bytes
+    {
+        return ((value & 0x000000FFU) << 24) |
+               ((value & 0x0000FF00U) << 8) |
+               ((value & 0x00FF0000U) >> 8) |
+               ((value & 0xFF000000U) >> 24);
+    }
+
+    private static ulong SwapUInt64(ulong value) // reverses the order of the eight bytes
+    {
+        return ((value & 0x00000000000000FFUL) << 56) |
+               ((value & 0x000000000000FF00UL) << 40) |
+               ((value & 0x0000000000FF0000UL) << 24) |
+               ((value & 0x00000000FF000000UL) << 8) |
+               ((value & 0x000000FF00000000UL) >> 8) |
+               ((value & 0x0000FF0000000000UL) >> 24) |
+               ((value & 0x00FF000000000000UL) >> 40) |
+               ((value & 0xFF00000000000000UL) >> 56);
+    }
+}
diff --git a/Analyzer/Util/YamlSerializedFileDetector.cs b/Analyzer/Util/YamlSerializedFileDetector.cs
new file mode 100644
index 0000000..c4bd011
--- /dev/null
+++ b/Analyzer/Util/YamlSerializedFileDetector.cs
@@ -0,0 +1,78 @@
+using System;
+using System.IO;
+using System.Text;
+
+namespace UnityDataTools.Analyzer.Util;
+
+///
+/// Utility for detecting YAML-format Unity SerializedFiles.
+///
+/// Unity SerializedFiles can be stored in two formats:
+/// 1. Binary format (produced by builds - read by Unity Runtime, also used for imported artifacts etc) - detected by SerializedFileDetector
+/// 2. YAML format (text format used in Editor for .asset, .prefab, .unity files) - detected by this class
+///
+/// YAML SerializedFiles begin with the magic string "%YAML 1.1", optionally preceded by
+/// a UTF-8 BOM (byte order mark: 0xEF 0xBB 0xBF).
+///
+public static class YamlSerializedFileDetector
+{
+    private const string UnityTextMagicString = "%YAML 1.1"; // magic string expected at the start of the file (after an optional BOM)
+    private static readonly byte[] Utf8Bom = new byte[] { 0xEF, 0xBB, 0xBF };
+    /// <summary>Returns true when the file exists, is readable, and starts with "%YAML 1.1", optionally preceded by a UTF-8 BOM; false in every other case.</summary>
+    public static bool IsYamlSerializedFile(string filePath)
+    {
+        if (!File.Exists(filePath))
+            return false;
+
+        try
+        {
+            using var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read, FileShare.Read);
+
+            // Unity checks for UTF-8 BOM (3 bytes) + magic string (9 bytes) = 12 bytes total
+            const int bomLength = 3;
+            int magicLength = UnityTextMagicString.Length;
+            int bufferSize = bomLength + magicLength;
+
+            if (stream.Length < magicLength)
+                return false;
+
+            byte[] buffer = new byte[bufferSize];
+            int bytesRead = stream.Read(buffer, 0, buffer.Length); // Read stops at end of file by itself; never narrow stream.Length to int, it wraps around for files over 2 GiB
+
+            if (bytesRead < magicLength)
+                return false;
+
+            int offset = 0;
+            if (bytesRead >= bomLength && HasUtf8Bom(buffer))
+            {
+                offset = bomLength;
+            }
+
+            // Check for magic string after BOM (if present)
+            if (bytesRead - offset < magicLength)
+                return false;
+
+            string fileStart = Encoding.ASCII.GetString(buffer, offset, magicLength);
+            return fileStart == UnityTextMagicString;
+        }
+        catch
+        {
+            // Any exception during file reading means this isn't a valid YAML file
+            return false;
+        }
+    }
+
+    private static bool HasUtf8Bom(byte[] buffer) // true when buffer starts with the three BOM bytes
+    {
+        if (buffer.Length < Utf8Bom.Length)
+            return false;
+
+        for (int i = 0; i < Utf8Bom.Length; i++)
+        {
+            if (buffer[i] != Utf8Bom[i])
+                return false;
+        }
+
+        return true;
+    }
+}
diff --git a/Documentation/command-serialized-file.md b/Documentation/command-serialized-file.md
index ebae339..d8d7cc4 100644
--- a/Documentation/command-serialized-file.md
+++ b/Documentation/command-serialized-file.md
@@ -2,12 +2,25 @@
 
 The `serialized-file` command (alias: `sf`) provides utilities for quickly inspecting SerializedFile metadata without performing a full analysis.
 
+This exposes information about the Binary SerializedFile format.
This format has evolved over time, but all recent versions have:
+* a small header section (exposed by the `header` subcommand)
+* a metadata section which contains a summary of the data
+  * Unity Version and target platform
+  * typetree information
+  * the list of objects and offsets
+  * external references
+* the data section which contains the Unity objects in serialized form
+
+The `externalrefs` and `objectlist` sub-commands expose information from the metadata section.
+The `dump` command can be used to view the serialized objects.
+
 ## Sub-Commands
 
 | Sub-Command | Description |
 |-------------|-------------|
 | [`externalrefs`](#externalrefs) | List external file references |
 | [`objectlist`](#objectlist) | List all objects in the file |
+| [`header`](#header) | Show SerializedFile header information |
 
 ---
 
@@ -128,6 +141,69 @@ UnityDataTool serialized-file objectlist level0 --format json
 
 ---
 
+## header
+
+Shows the SerializedFile header information. This is useful for testing whether a file is a valid SerializedFile and for inspecting the version and structure.
+
+### Quick Reference
+
+```
+UnityDataTool serialized-file header <path> [options]
+UnityDataTool sf header <path> [options]
+```
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `<path>` | Path to the SerializedFile | *(required)* |
+| `-f, --format <format>` | Output format: `Text` or `Json` | `Text` |
+
+### Example - Text Output
+
+```bash
+UnityDataTool sf header sharedassets0.assets
+```
+
+**Output:**
+```
+Version          22
+Format           Modern (64-bit)
+File Size        1,234,567 bytes
+Metadata Size    45,678 bytes
+Data Offset      45,728
+Endianness       Little Endian
+```
+
+### Example - JSON Output
+
+```bash
+UnityDataTool serialized-file header level0 --format json
+```
+
+**Output:**
+```json
+{
+  "version": 22,
+  "format": "Modern (64-bit)",
+  "fileSize": 31988,
+  "metadataSize": 24580,
+  "dataOffset": 24640,
+  "endianness": "Little Endian"
+}
+```
+
+### Header Fields
+
+| Field | Description |
+|-------|-------------|
+| **Version** | SerializedFile format version. Modern Unity (2019+) uses version 22+. |
+| **Format** | Header format type: "Legacy (32-bit)" for versions < 22, or "Modern (64-bit)" for versions ≥ 22. Modern format supports files larger than 4GB. |
+| **File Size** | Total size of the SerializedFile in bytes. Padding might make the actual file size slightly larger. |
+| **Metadata Size** | Size of the metadata section containing type information and object indices. |
+| **Data Offset** | Byte offset where the object data section begins in the file. |
+| **Endianness** | Byte order of the data in the file: "Little Endian" (x86, most platforms) or "Big Endian" (older console platforms).
| + +--- + ## Use Cases ### Quick File Inspection @@ -135,6 +211,9 @@ UnityDataTool serialized-file objectlist level0 --format json Use `serialized-file` when you need quick information about a SerializedFile without generating a full SQLite database: ```bash +# Check file format and version +UnityDataTool sf header level0 + # Check what objects are in a file UnityDataTool sf objectlist sharedassets0.assets diff --git a/TestCommon/Data/LegacyFormats/CAB-c5053efeda8860d7e7b7ce4b4c66705b b/TestCommon/Data/LegacyFormats/CAB-c5053efeda8860d7e7b7ce4b4c66705b new file mode 100644 index 0000000..ce3427d Binary files /dev/null and b/TestCommon/Data/LegacyFormats/CAB-c5053efeda8860d7e7b7ce4b4c66705b differ diff --git a/TestCommon/Data/LegacyFormats/alienprefab b/TestCommon/Data/LegacyFormats/alienprefab new file mode 100644 index 0000000..4ff2926 Binary files /dev/null and b/TestCommon/Data/LegacyFormats/alienprefab differ diff --git a/TestCommon/Data/YamlFormat.asset b/TestCommon/Data/YamlFormat.asset new file mode 100644 index 0000000..4d29b23 --- /dev/null +++ b/TestCommon/Data/YamlFormat.asset @@ -0,0 +1,15 @@ +%YAML 1.1 +%TAG !u! 
tag:unity3d.com,2011: +--- !u!114 &11400000 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 0} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: 070349760c9dbfd4e8318d73401cca23, type: 3} + m_Name: SimpleScriptableObjectAsset1 + m_EditorClassIdentifier: + Data: 67 diff --git a/UnityDataTool.Tests/SerializedFileCommandTests.cs b/UnityDataTool.Tests/SerializedFileCommandTests.cs index c2b9591..c4ddeb5 100644 --- a/UnityDataTool.Tests/SerializedFileCommandTests.cs +++ b/UnityDataTool.Tests/SerializedFileCommandTests.cs @@ -295,6 +295,123 @@ public async Task ObjectList_SharedAssets_ContainsExpectedTypes() #endregion + #region Header Tests + + [Test] + public async Task Header_TextFormat_OutputsCorrectly() + { + var path = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "PlayerNoTypeTree", "sharedassets0.assets"); + using var sw = new StringWriter(); + var currentOut = Console.Out; + try + { + Console.SetOut(sw); + + Assert.AreEqual(0, await Program.Main(new string[] { "serialized-file", "header", path })); + + var output = sw.ToString(); + var lines = output.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries); + + // Should have header information lines + Assert.Greater(lines.Length, 0, "Expected header information"); + + // Check for expected fields + StringAssert.Contains("Version", output); + StringAssert.Contains("Format", output); + StringAssert.Contains("File Size", output); + StringAssert.Contains("Metadata Size", output); + StringAssert.Contains("Data Offset", output); + StringAssert.Contains("Endianness", output); + } + finally + { + Console.SetOut(currentOut); + } + } + + [Test] + public async Task Header_JsonFormat_OutputsValidJson() + { + var path = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "BuildReports", "Player.buildreport"); + using var sw = new 
StringWriter(); + var currentOut = Console.Out; + try + { + Console.SetOut(sw); + + Assert.AreEqual(0, await Program.Main(new string[] { "serialized-file", "header", path, "-f", "json" })); + + var output = sw.ToString(); + + // Parse JSON to verify it's valid + var jsonDoc = JsonDocument.Parse(output); + var root = jsonDoc.RootElement; + + // Verify all expected properties are present + Assert.IsTrue(root.TryGetProperty("version", out _)); + Assert.IsTrue(root.TryGetProperty("format", out _)); + Assert.IsTrue(root.TryGetProperty("fileSize", out _)); + Assert.IsTrue(root.TryGetProperty("metadataSize", out _)); + Assert.IsTrue(root.TryGetProperty("dataOffset", out _)); + Assert.IsTrue(root.TryGetProperty("endianness", out _)); + + // Verify version is a number + var version = root.GetProperty("version").GetUInt32(); + Assert.Greater(version, 0u, "Version should be greater than 0"); + + // Verify format is a valid string + var format = root.GetProperty("format").GetString(); + Assert.IsTrue(format == "Legacy (32-bit)" || format == "Modern (64-bit)", + $"Format should be either Legacy or Modern, got: {format}"); + } + finally + { + Console.SetOut(currentOut); + } + } + + [Test] + public async Task Header_InvalidFile_ReturnsError() + { + var path = Path.Combine(m_TestDataFolder, "README.md"); + + var result = await Program.Main(new string[] { "serialized-file", "header", path }); + Assert.AreNotEqual(0, result, "Should return error code for invalid file"); + } + + [Test] + public async Task Header_ArchiveFile_ReturnsError() + { + var legacyDir = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "LegacyFormats"); + var archivePath = Path.Combine(legacyDir, "alienprefab"); + + if (!File.Exists(archivePath)) + { + Assert.Ignore("alienprefab test file not found"); + return; + } + + using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "serialized-file", 
"header", archivePath }); + + Assert.AreNotEqual(0, result, "Should return error code for archive file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("Unity Archive", errorOutput, "Error message should mention Unity Archive"); + } + finally + { + Console.SetError(currentErr); + } + } + + #endregion + #region Cross-Validation with Analyze Command [Test] @@ -495,6 +612,147 @@ public async Task ErrorHandling_NonExistentFile_ReturnsError() Assert.AreNotEqual(0, result, "Should return error code for non-existent file"); } + [Test] + public async Task ErrorHandling_ArchiveFile_ReturnsHelpfulError() + { + // Use an AssetBundle from test data + var assetBundlesDir = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "AssetBundles", "2022.1.20f1"); + + // Skip if the test data doesn't exist (CI environments might not have all test data) + if (!Directory.Exists(assetBundlesDir)) + { + Assert.Ignore("AssetBundle test data not found"); + return; + } + + var archiveFiles = Directory.GetFiles(assetBundlesDir, "*", SearchOption.TopDirectoryOnly); + if (archiveFiles.Length == 0) + { + Assert.Ignore("No AssetBundle test files found"); + return; + } + + var archivePath = archiveFiles[0]; // Use first archive file found + + using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "serialized-file", "objectlist", archivePath }); + + Assert.AreNotEqual(0, result, "Should return error code for archive file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("Unity Archive", errorOutput, "Error message should mention Unity Archive"); + StringAssert.Contains("archive extract", errorOutput, "Error message should suggest using archive extract command"); + } + finally + { + Console.SetError(currentErr); + } + } + + [Test] + public async Task ErrorHandling_InvalidFile_ShowsHelpfulMessage() + { + var path = Path.Combine(m_TestDataFolder, "README.md"); + + 
using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "serialized-file", "objectlist", path }); + + Assert.AreNotEqual(0, result, "Should return error code for invalid file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("not appear to be a valid Unity SerializedFile", errorOutput, + "Error message should explain that the file is not a valid SerializedFile"); + } + finally + { + Console.SetError(currentErr); + } + } + + [Test] + public async Task ErrorHandling_YamlFile_ReturnsHelpfulError() + { + var path = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "YamlFormat.asset"); + + using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "serialized-file", "header", path }); + + Assert.AreNotEqual(0, result, "Should return error code for YAML file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("YAML-format SerializedFile", errorOutput, "Error message should mention YAML format"); + StringAssert.Contains("not supported", errorOutput, "Error message should explain YAML is not supported"); + StringAssert.Contains("binary-format", errorOutput, "Error message should mention binary format is supported"); + } + finally + { + Console.SetError(currentErr); + } + } + + [Test] + public async Task ErrorHandling_YamlFile_ExternalRefs_ReturnsHelpfulError() + { + var path = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "YamlFormat.asset"); + + using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "serialized-file", "externalrefs", path }); + + Assert.AreNotEqual(0, result, "Should return error code for YAML file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("YAML-format SerializedFile", errorOutput, "Error 
message should mention YAML format"); + } + finally + { + Console.SetError(currentErr); + } + } + + [Test] + public async Task ErrorHandling_YamlFile_ObjectList_ReturnsHelpfulError() + { + var path = Path.Combine(TestContext.CurrentContext.TestDirectory, "Data", "YamlFormat.asset"); + + using var sw = new StringWriter(); + var currentErr = Console.Error; + try + { + Console.SetError(sw); + + var result = await Program.Main(new string[] { "sf", "objectlist", path }); + + Assert.AreNotEqual(0, result, "Should return error code for YAML file"); + + var errorOutput = sw.ToString(); + StringAssert.Contains("YAML-format SerializedFile", errorOutput, "Error message should mention YAML format"); + } + finally + { + Console.SetError(currentErr); + } + } + #endregion } diff --git a/UnityDataTool/Program.cs b/UnityDataTool/Program.cs index 59fae2d..ba6c512 100644 --- a/UnityDataTool/Program.cs +++ b/UnityDataTool/Program.cs @@ -154,10 +154,21 @@ public static async Task Main(string[] args) (FileInfo fi, OutputFormat f) => Task.FromResult(SerializedFileCommands.HandleObjectList(fi, f)), pathArg, fOpt); + var headerCommand = new Command("header", "Show SerializedFile header information.") + { + pathArg, + fOpt, + }; + + headerCommand.SetHandler( + (FileInfo fi, OutputFormat f) => Task.FromResult(SerializedFileCommands.HandleHeader(fi, f)), + pathArg, fOpt); + var serializedFileCommand = new Command("serialized-file", "Inspect a SerializedFile (scene, assets, etc.).") { externalRefsCommand, objectListCommand, + headerCommand, }; serializedFileCommand.AddAlias("sf"); diff --git a/UnityDataTool/SerializedFileCommands.cs b/UnityDataTool/SerializedFileCommands.cs index 56ca0ca..3098a8a 100644 --- a/UnityDataTool/SerializedFileCommands.cs +++ b/UnityDataTool/SerializedFileCommands.cs @@ -1,6 +1,7 @@ using System; using System.IO; using System.Text.Json; +using UnityDataTools.Analyzer.Util; using UnityDataTools.FileSystem; namespace UnityDataTools.UnityDataTool; @@ -9,46 +10,109 @@ 
public static class SerializedFileCommands { public static int HandleExternalRefs(FileInfo filename, OutputFormat format) { + if (!ValidateSerializedFile(filename.FullName, out _)) + return 1; + try { using var sf = UnityFileSystem.OpenSerializedFile(filename.FullName); - if (format == OutputFormat.Json) OutputExternalRefsJson(sf); else OutputExternalRefsText(sf); + return 0; } catch (Exception err) when (err is NotSupportedException || err is FileFormatException) { - Console.Error.WriteLine($"Error opening serialized file: {filename.FullName}"); + Console.Error.WriteLine($"Error opening SerializedFile: {filename.FullName}"); Console.Error.WriteLine(err.Message); return 1; } - - return 0; } public static int HandleObjectList(FileInfo filename, OutputFormat format) { + if (!ValidateSerializedFile(filename.FullName, out _)) + return 1; + try { using var sf = UnityFileSystem.OpenSerializedFile(filename.FullName); - if (format == OutputFormat.Json) OutputObjectListJson(sf); else OutputObjectListText(sf); + return 0; } catch (Exception err) when (err is NotSupportedException || err is FileFormatException) { - Console.Error.WriteLine($"Error opening serialized file: {filename.FullName}"); + Console.Error.WriteLine($"Error opening SerializedFile: {filename.FullName}"); Console.Error.WriteLine(err.Message); return 1; } + } + + public static int HandleHeader(FileInfo filename, OutputFormat format) + { + if (!ValidateSerializedFile(filename.FullName, out var fileInfo)) + return 1; + + if (format == OutputFormat.Json) + OutputHeaderJson(fileInfo); + else + OutputHeaderText(fileInfo); return 0; } + /// <summary> + /// Validates that a file is a SerializedFile and provides helpful error messages if not. 
+ /// </summary> + /// <param name="filePath">Path to the file to validate</param> + /// <param name="fileInfo">SerializedFile header information if valid, null otherwise</param> + /// <returns>True if valid SerializedFile, false otherwise</returns> + private static bool ValidateSerializedFile(string filePath, out SerializedFileInfo fileInfo) + { + fileInfo = null; + + if (!File.Exists(filePath)) + { + Console.Error.WriteLine($"Error: File not found: {filePath}"); + return false; + } + + if (ArchiveDetector.IsUnityArchive(filePath)) + { + Console.Error.WriteLine($"Error: The file is an AssetBundle or other Unity Archive, not a SerializedFile."); + Console.Error.WriteLine($"File: {filePath}"); + Console.Error.WriteLine(); + Console.Error.WriteLine("Unity Archives contain SerializedFiles inside them."); + Console.Error.WriteLine("To access the SerializedFiles, first extract the archive using:"); + Console.Error.WriteLine($" UnityDataTool archive extract \"{filePath}\" -o "); + Console.Error.WriteLine(); + Console.Error.WriteLine("Then you can run serialized-file commands on the extracted files."); + return false; + } + + if (YamlSerializedFileDetector.IsYamlSerializedFile(filePath)) + { + Console.Error.WriteLine($"Error: The file is a YAML-format SerializedFile, which is not supported."); + Console.Error.WriteLine($"File: {filePath}"); + Console.Error.WriteLine(); + Console.Error.WriteLine("UnityDataTool only supports binary-format SerializedFiles."); + return false; + } + + if (!SerializedFileDetector.TryDetectSerializedFile(filePath, out fileInfo)) + { + Console.Error.WriteLine($"Error: The file does not appear to be a valid Unity SerializedFile."); + Console.Error.WriteLine($"File: {filePath}"); + return false; + } + + return true; + } + private static void OutputExternalRefsText(SerializedFile sf) { var refs = sf.ExternalReferences; @@ -135,5 +199,30 @@ private static string GetTypeName(SerializedFile sf, ObjectInfo obj) return TypeIdRegistry.GetTypeName(obj.TypeId); } 
} -} + private static void OutputHeaderText(SerializedFileInfo info) + { + 
Console.WriteLine($"{"Version",-20} {info.Version}"); + Console.WriteLine($"{"Format",-20} {(info.IsLegacyFormat ? "Legacy (32-bit)" : "Modern (64-bit)")}"); + Console.WriteLine($"{"File Size",-20} {info.FileSize:N0} bytes"); + Console.WriteLine($"{"Metadata Size",-20} {info.MetadataSize:N0} bytes"); + Console.WriteLine($"{"Data Offset",-20} {info.DataOffset:N0}"); + Console.WriteLine($"{"Endianness",-20} {(info.Endianness == 0 ? "Little Endian" : "Big Endian")}"); + } + + private static void OutputHeaderJson(SerializedFileInfo info) + { + var jsonObject = new + { + version = info.Version, + format = info.IsLegacyFormat ? "Legacy (32-bit)" : "Modern (64-bit)", + fileSize = info.FileSize, + metadataSize = info.MetadataSize, + dataOffset = info.DataOffset, + endianness = info.Endianness == 0 ? "Little Endian" : "Big Endian" + }; + + var json = JsonSerializer.Serialize(jsonObject, new JsonSerializerOptions { WriteIndented = true }); + Console.WriteLine(json); + } +}