Compiler Explorer

Source code

using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
public class C
{
    /// <summary>
    /// Hex-encodes <paramref name="bytes"/> into <paramref name="chars"/> as UTF-16, four input bytes
    /// (eight output characters) per iteration.
    /// </summary>
    /// <remarks>
    /// Loop shape "pr": after storing a chunk, the loop exits when the offset that was just processed
    /// is the offset of the final chunk (<c>bytes.Length - 4</c>); otherwise it advances by four and
    /// clamps to that final offset so the tail is covered by an overlapping chunk.
    /// </remarks>
    /// <param name="bytes">Input bytes; must contain at least four bytes (asserted in debug builds).</param>
    /// <param name="chars">Destination; must hold at least <c>2 * bytes.Length</c> characters because the
    /// stores are not bounds-checked.</param>
    /// <param name="casing"><see langword="true"/> selects the digits 'A'-'F', <see langword="false"/> selects 'a'-'f'.</param>
    private static void EncodeToUtf16_Vector128_pr(ReadOnlySpan<byte> bytes, Span<char> chars, bool casing)
    {
        // Vector128<int>.Count == 4 == sizeof(uint): the number of input bytes consumed per iteration.
        Debug.Assert(bytes.Length >= Vector128<int>.Count);
        // Every iteration writes 8 UTF-16 code units at offset pos * 2 without any bounds check.
        Debug.Assert((long)chars.Length >= 2L * bytes.Length);

        ref byte srcRef = ref MemoryMarshal.GetReference(bytes);
        ref ushort destRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(chars));

        Vector128<byte> hexMap = CreateHexMap(casing);

        nuint pos = 0;
        nuint lengthSubVector128 = (nuint)bytes.Length - (nuint)Vector128<int>.Count;
        do
        {
            // This implementation processes 4 bytes of input at once, it can be easily modified
            // to support 16 bytes at once, but that didn't demonstrate noticeable wins
            // for Convert.ToHexString (around 8% faster for large inputs) so
            // it focuses on small inputs instead.
            EncodeChunk(ref srcRef, ref destRef, pos, hexMap);

            // The chunk that was just written started at the last valid offset: everything is encoded.
            if (pos == lengthSubVector128)
            {
                return;
            }

            pos += (nuint)Vector128<int>.Count;

            // Overlap with the current chunk for trailing elements
            if (pos > lengthSubVector128)
            {
                pos = lengthSubVector128;
            }
        }
        while (true);
    }

    /// <summary>
    /// Hex-encodes <paramref name="bytes"/> into <paramref name="chars"/> as UTF-16, four input bytes
    /// (eight output characters) per iteration.
    /// </summary>
    /// <remarks>
    /// Loop shape "base": after storing a chunk, the loop advances by four and exits when the new
    /// offset equals <c>bytes.Length</c>; otherwise it clamps the offset to <c>bytes.Length - 4</c> so
    /// the tail is covered by an overlapping chunk.
    /// </remarks>
    /// <param name="bytes">Input bytes; must contain at least four bytes (asserted in debug builds).</param>
    /// <param name="chars">Destination; must hold at least <c>2 * bytes.Length</c> characters because the
    /// stores are not bounds-checked.</param>
    /// <param name="casing"><see langword="true"/> selects the digits 'A'-'F', <see langword="false"/> selects 'a'-'f'.</param>
    private static void EncodeToUtf16_Vector128_base(ReadOnlySpan<byte> bytes, Span<char> chars, bool casing)
    {
        // Vector128<int>.Count == 4 == sizeof(uint): the number of input bytes consumed per iteration.
        Debug.Assert(bytes.Length >= Vector128<int>.Count);
        // Every iteration writes 8 UTF-16 code units at offset pos * 2 without any bounds check.
        Debug.Assert((long)chars.Length >= 2L * bytes.Length);

        ref byte srcRef = ref MemoryMarshal.GetReference(bytes);
        ref ushort destRef = ref Unsafe.As<char, ushort>(ref MemoryMarshal.GetReference(chars));

        Vector128<byte> hexMap = CreateHexMap(casing);

        nuint pos = 0;
        nuint lengthSubVector128 = (nuint)bytes.Length - (nuint)Vector128<int>.Count;
        do
        {
            // This implementation processes 4 bytes of input at once, it can be easily modified
            // to support 16 bytes at once, but that didn't demonstrate noticeable wins
            // for Convert.ToHexString (around 8% faster for large inputs) so
            // it focuses on small inputs instead.
            EncodeChunk(ref srcRef, ref destRef, pos, hexMap);

            pos += (nuint)Vector128<int>.Count;

            // The advanced offset landed exactly on the end of the input: everything is encoded.
            if (pos == (nuint)bytes.Length)
            {
                return;
            }

            // Overlap with the current chunk for trailing elements
            if (pos > lengthSubVector128)
            {
                pos = lengthSubVector128;
            }
        }
        while (true);
    }

    /// <summary>Builds the 16-entry nibble-to-ASCII lookup table for the requested casing.</summary>
    /// <param name="casing"><see langword="true"/> for 'A'-'F', <see langword="false"/> for 'a'-'f'.</param>
    /// <returns>A vector whose element <c>i</c> is the ASCII hex digit for the value <c>i</c>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector128<byte> CreateHexMap(bool casing)
    {
        return casing ?
            Vector128.Create((byte)'0', (byte)'1', (byte)'2', (byte)'3',
                             (byte)'4', (byte)'5', (byte)'6', (byte)'7',
                             (byte)'8', (byte)'9', (byte)'A', (byte)'B',
                             (byte)'C', (byte)'D', (byte)'E', (byte)'F') :
            Vector128.Create((byte)'0', (byte)'1', (byte)'2', (byte)'3',
                             (byte)'4', (byte)'5', (byte)'6', (byte)'7',
                             (byte)'8', (byte)'9', (byte)'a', (byte)'b',
                             (byte)'c', (byte)'d', (byte)'e', (byte)'f');
    }

    /// <summary>
    /// Reads four bytes at <paramref name="pos"/> from the source and stores their eight hex digits,
    /// widened to UTF-16, at element offset <c>pos * 2</c> in the destination.
    /// </summary>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static void EncodeChunk(ref byte srcRef, ref ushort destRef, nuint pos, Vector128<byte> hexMap)
    {
        uint i32 = Unsafe.ReadUnaligned<uint>(ref Unsafe.Add(ref srcRef, pos));
        Vector128<byte> vec = Vector128.CreateScalar(i32).AsByte();

        // Only the lower halves are consumed here.
        // JIT is expected to eliminate all unused calculations
        (Vector128<byte> hexLow, _) = AsciiToHexVector128(vec, hexMap);
        (Vector128<ushort> v0, _) = Vector128.Widen(hexLow);

        v0.StoreUnsafe(ref destRef, pos * 2);
    }

    /// <summary>
    /// Converts the 16 bytes of <paramref name="src"/> into 32 ASCII hex digits using
    /// <paramref name="hexMap"/> as the nibble-to-digit lookup table.
    /// </summary>
    /// <param name="src">The bytes to encode.</param>
    /// <param name="hexMap">Lookup table whose element <c>i</c> is the ASCII digit for nibble value <c>i</c>.</param>
    /// <returns>
    /// Two vectors of ASCII bytes: the first holds the digits of source bytes 0-7, the second the digits
    /// of source bytes 8-15. For every source byte the high-nibble digit precedes the low-nibble digit.
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static (Vector128<byte>, Vector128<byte>) AsciiToHexVector128(Vector128<byte> src, Vector128<byte> hexMap)
    {
        Debug.Assert(Ssse3.IsSupported || AdvSimd.Arm64.IsSupported);

        // Shifting the 64-bit lanes right by 4 moves each byte's high nibble into the low four bits
        // of that byte (the upper four bits receive bits from the neighbouring byte and are masked
        // off below).
        Vector128<byte> shiftedSrc = Vector128.ShiftRightLogical(src.AsUInt64(), 4).AsByte();

        // Interleaving (shifted, original) yields, per source byte, the pair
        // [high nibble, low nibble] once each element is masked with 0xF.
        Vector128<byte> lowNibbles = UnpackLow(shiftedSrc, src);
        Vector128<byte> highNibbles = UnpackHigh(shiftedSrc, src);

        // Each masked nibble (0..15) indexes hexMap to obtain its ASCII digit.
        return (ShuffleUnsafe(hexMap, lowNibbles & Vector128.Create((byte)0xF)),
            ShuffleUnsafe(hexMap, highNibbles & Vector128.Create((byte)0xF)));
    }

    /// <summary>
    /// Interleaves the lower eight bytes of <paramref name="left"/> and <paramref name="right"/>
    /// using <c>Sse2.UnpackLow</c> or <c>AdvSimd.Arm64.ZipLow</c>.
    /// </summary>
    /// <exception cref="PlatformNotSupportedException">Neither SSE2 nor AdvSimd.Arm64 is supported.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte> right)
    {
        if (Sse2.IsSupported)
        {
            return Sse2.UnpackLow(left, right);
        }

        if (AdvSimd.Arm64.IsSupported)
        {
            return AdvSimd.Arm64.ZipLow(left, right);
        }

        throw new PlatformNotSupportedException("UnpackLow requires SSE2 or AdvSimd.Arm64 support.");
    }

    /// <summary>
    /// Interleaves the upper eight bytes of <paramref name="left"/> and <paramref name="right"/>
    /// using <c>Sse2.UnpackHigh</c> or <c>AdvSimd.Arm64.ZipHigh</c>.
    /// </summary>
    /// <exception cref="PlatformNotSupportedException">Neither SSE2 nor AdvSimd.Arm64 is supported.</exception>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte> right)
    {
        if (Sse2.IsSupported)
        {
            return Sse2.UnpackHigh(left, right);
        }

        if (AdvSimd.Arm64.IsSupported)
        {
            return AdvSimd.Arm64.ZipHigh(left, right);
        }

        throw new PlatformNotSupportedException("UnpackHigh requires SSE2 or AdvSimd.Arm64 support.");
    }

    /// <summary>
    /// Selects bytes from <paramref name="vector"/> by <paramref name="indices"/>, using the first
    /// available of: <c>Ssse3.Shuffle</c>, <c>AdvSimd.Arm64.VectorTableLookup</c>,
    /// <c>PackedSimd.Swizzle</c>, or the portable <c>Vector128.Shuffle</c>.
    /// </summary>
    /// <remarks>
    /// NOTE(review): the selected instructions may treat indices outside 0..15 differently from one
    /// another; the callers in this class mask every index with 0xF before calling — confirm before
    /// passing unmasked indices.
    /// </remarks>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    internal static Vector128<byte> ShuffleUnsafe(Vector128<byte> vector, Vector128<byte> indices)
    {
        if (Ssse3.IsSupported)
        {
            return Ssse3.Shuffle(vector, indices);
        }

        if (AdvSimd.Arm64.IsSupported)
        {
            return AdvSimd.Arm64.VectorTableLookup(vector, indices);
        }

        if (PackedSimd.IsSupported)
        {
            return PackedSimd.Swizzle(vector, indices);
        }

        return Vector128.Shuffle(vector, indices);
    }
}