Skip to content

Commit

Permalink
Merge pull request #1817 from SixLabors/bp/sse4X4
Browse files Browse the repository at this point in the history
Add SSE2 version of Vp8Sse4X4
  • Loading branch information
JimBobSquarePants authored Nov 10, 2021
2 parents 255226b + 1997d59 commit 7d74c4c
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 39 deletions.
66 changes: 53 additions & 13 deletions src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,63 @@ internal static class LossyUtils
private static readonly Vector128<byte> Mean16x4Mask = Vector128.Create((short)0x00ff).AsByte();
#endif

// Note: the corresponding method in the libwebp reference implementation is named VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);

// Note: the corresponding method in the libwebp reference implementation is named VP8SSE16x8.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);

// Note: the corresponding method in the libwebp reference implementation is named VP8SSE4x4.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
public static int Vp8_Sse4X4(Span<byte> a, Span<byte> b)
{
    // Returns the sum of squared differences between the 4x4 pixel blocks starting at a[0] and b[0].
    // Consecutive rows of a block are WebpConstants.Bps bytes apart in both buffers.
#if SUPPORTS_RUNTIME_INTRINSICS
    if (Sse2.IsSupported)
    {
        // Load values.
        // NOTE(review): every load reads a full 16 byte vector through Unsafe.As without any bounds check,
        // although only the low 4 bytes of each vector are used. Callers must therefore pass spans with
        // at least (3 * WebpConstants.Bps) + 16 readable bytes.
        ref byte aRef = ref MemoryMarshal.GetReference(a);
        Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref aRef);
        Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps));
        Vector128<byte> a2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2));
        Vector128<byte> a3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3));
        ref byte bRef = ref MemoryMarshal.GetReference(b);
        Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref bRef);
        Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps));
        Vector128<byte> b2 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2));
        Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));

        // Combine pair of lines: the low 8 bytes of each result hold the 4 pixels of two consecutive rows.
        Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
        Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
        Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
        Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());

        // Convert to 16b: interleaving with zero widens every pixel to a non-negative 16 bit lane.
        Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
        Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();

        // Subtract, square and accumulate.
        // The subtraction has to operate on signed 16 bit lanes. The unsigned byte overload of
        // SubtractSaturate clamps every difference where a < b to zero, which silently drops
        // that pixel's contribution to the sum.
        Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
        Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
        Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
        Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
        Vector128<int> sum = Sse2.Add(e0, e1);

        return Numerics.ReduceSum(sum);
    }
    else
#endif
    {
        // Fallback when SSE2 is not available (or intrinsics are compiled out).
        return Vp8_SseNxN(a, b, 4, 4);
    }
}

[MethodImpl(InliningOptions.ShortMethod)]
public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)
public static int Vp8_SseNxN(Span<byte> a, Span<byte> b, int w, int h)
{
int count = 0;
int aOffset = 0;
Expand Down Expand Up @@ -88,7 +134,7 @@ public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<i
#if SUPPORTS_RUNTIME_INTRINSICS
if (Sse41.IsSupported)
{
int diffSum = TTransformSse41(a, b, w, scratch);
int diffSum = TTransformSse41(a, b, w);
return Math.Abs(diffSum) >> 5;
}
else
Expand Down Expand Up @@ -615,11 +661,8 @@ public static int TTransform(Span<byte> input, Span<ushort> w, Span<int> scratch
/// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w, Span<int> scratch)
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{
Span<int> sum = scratch.Slice(0, 4);
sum.Clear();

// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
Vector128<byte> ina1 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA.Slice(WebpConstants.Bps, 16)));
Expand Down Expand Up @@ -724,9 +767,7 @@ public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ush
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());

ref int outputRef = ref MemoryMarshal.GetReference(sum);
Unsafe.As<int, Vector128<int>>(ref outputRef) = result.AsInt32();
return sum[3] + sum[2] + sum[1] + sum[0];
return Numerics.ReduceSum(result);
}
#endif

Expand All @@ -739,7 +780,6 @@ public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scrat
public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
int tmpOffset = 0;
for (int srcOffset = 0; srcOffset < 4; srcOffset++)
{
Expand Down
16 changes: 6 additions & 10 deletions src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ public static void PickBestIntra16(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Se
rdCur.Nz = (uint)ReconstructIntra16(it, dqm, rdCur, tmpDst, mode);

// Measure RD-score.
rdCur.D = LossyUtils.Vp8Sse16X16(src, tmpDst);
rdCur.D = LossyUtils.Vp8_Sse16X16(src, tmpDst);
rdCur.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto16X16(src, tmpDst, WeightY, scratch)) : 0;
rdCur.H = WebpConstants.Vp8FixedCostsI16[mode];
rdCur.R = it.GetCostLuma16(rdCur, proba, res);
Expand Down Expand Up @@ -160,7 +160,7 @@ public static bool PickBestIntra4(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Seg
rdTmp.Nz = (uint)ReconstructIntra4(it, dqm, tmpLevels, src, tmpDst, mode);

// Compute RD-score.
rdTmp.D = LossyUtils.Vp8Sse4X4(src, tmpDst);
rdTmp.D = LossyUtils.Vp8_Sse4X4(src, tmpDst);
rdTmp.SD = tlambda != 0 ? Mult8B(tlambda, LossyUtils.Vp8Disto4X4(src, tmpDst, WeightY, scratch)) : 0;
rdTmp.H = modeCosts[mode];

Expand Down Expand Up @@ -251,7 +251,7 @@ public static void PickBestUv(Vp8EncIterator it, ref Vp8ModeScore rd, Vp8Segment
rdUv.Nz = (uint)ReconstructUv(it, dqm, rdUv, tmpDst, mode);

// Compute RD-score
rdUv.D = LossyUtils.Vp8Sse16X8(src, tmpDst);
rdUv.D = LossyUtils.Vp8_Sse16X8(src, tmpDst);
rdUv.SD = 0; // not calling TDisto here: it tends to flatten areas.
rdUv.H = WebpConstants.Vp8FixedCostsUv[mode];
rdUv.R = it.GetCostUv(rdUv, proba, res);
Expand Down Expand Up @@ -340,8 +340,6 @@ public static int ReconstructIntra4(Vp8EncIterator it, Vp8SegmentInfo dqm, Span<
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
Span<short> tmp = it.Scratch2.AsSpan(0, 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();
Vp8Encoding.FTransform(src, reference, tmp, scratch);
int nz = QuantizeBlock(tmp, levels, ref dqm.Y1);
Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch);
Expand All @@ -357,8 +355,6 @@ public static int ReconstructUv(Vp8EncIterator it, Vp8SegmentInfo dqm, Vp8ModeSc
int n;
Span<short> tmp = it.Scratch2.AsSpan(0, 8 * 16);
Span<int> scratch = it.Scratch3.AsSpan(0, 16);
tmp.Clear();
scratch.Clear();

for (n = 0; n < 8; n += 2)
{
Expand Down Expand Up @@ -411,7 +407,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I16ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);
long score = (LossyUtils.Vp8_Sse16X16(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsI16[mode] * lambdaDi16);

if (mode > 0 && WebpConstants.Vp8FixedCostsI16[mode] > bitLimit)
{
Expand Down Expand Up @@ -458,7 +454,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumBModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8I4ModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
long score = (LossyUtils.Vp8_Sse4X4(src, reference) * WebpConstants.RdDistoMult) + (modeCosts[mode] * lambdaDi4);
if (score < bestI4Score)
{
bestI4Mode = mode;
Expand Down Expand Up @@ -507,7 +503,7 @@ public static void RefineUsingDistortion(Vp8EncIterator it, Vp8SegmentInfo[] seg
for (mode = 0; mode < WebpConstants.NumPredModes; ++mode)
{
Span<byte> reference = it.YuvP.AsSpan(Vp8Encoding.Vp8UvModeOffsets[mode]);
long score = (LossyUtils.Vp8Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
long score = (LossyUtils.Vp8_Sse16X8(src, reference) * WebpConstants.RdDistoMult) + (WebpConstants.Vp8FixedCostsUv[mode] * lambdaDuv);
if (score < bestUvScore)
{
bestMode = mode;
Expand Down
3 changes: 0 additions & 3 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ public static void ITransformOne(Span<byte> reference, Span<short> input, Span<b
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();
for (i = 0; i < 4; i++)
{
// vertical pass.
Expand Down Expand Up @@ -124,7 +123,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
{
int i;
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();

int srcIdx = 0;
int refIdx = 0;
Expand Down Expand Up @@ -163,7 +161,6 @@ public static void FTransform(Span<byte> src, Span<byte> reference, Span<short>
public static void FTransformWht(Span<short> input, Span<short> output, Span<int> scratch)
{
Span<int> tmp = scratch.Slice(0, 16);
tmp.Clear();

int i;
int inputIdx = 0;
Expand Down
1 change: 0 additions & 1 deletion src/ImageSharp/Formats/Webp/Lossy/Vp8Histogram.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ public void CollectHistogram(Span<byte> reference, Span<byte> pred, int startBlo
this.distribution.AsSpan().Clear();
for (j = startBlock; j < endBlock; j++)
{
this.output.AsSpan().Clear();
this.Vp8FTransform(reference.Slice(WebpLookupTables.Vp8DspScan[j]), pred.Slice(WebpLookupTables.Vp8DspScan[j]), this.output);

// Convert coefficients to bin.
Expand Down
17 changes: 5 additions & 12 deletions src/ImageSharp/Formats/Webp/Lossy/Vp8ModeScore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,11 @@ public Vp8ModeScore()

public void Clear()
{
this.YDcLevels.AsSpan().Clear();
this.YAcLevels.AsSpan().Clear();
this.UvLevels.AsSpan().Clear();
this.ModesI4.AsSpan().Clear();

for (int i = 0; i < 2; i++)
{
for (int j = 0; j < 3; j++)
{
this.Derr[i, j] = 0;
}
}
Array.Clear(this.YDcLevels, 0, this.YDcLevels.Length);
Array.Clear(this.YAcLevels, 0, this.YAcLevels.Length);
Array.Clear(this.UvLevels, 0, this.UvLevels.Length);
Array.Clear(this.ModesI4, 0, this.ModesI4.Length);
Array.Clear(this.Derr, 0, this.Derr.Length);
}

public void InitScore()
Expand Down
38 changes: 38 additions & 0 deletions tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,35 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP
[Trait("Format", "Webp")]
public class LossyUtilsTests
{
// Checks LossyUtils.Vp8_Sse4X4 against a fixed pair of 128 byte buffers with a known result.
// Shared by the plain [Fact] and the hardware-intrinsics on/off variants.
private static void RunVp8Sse4X4Test()
{
// Each buffer holds 128 bytes; presumably four rows of WebpConstants.Bps (32?) bytes each - TODO confirm.
// Under that assumption only the first 4 bytes of every row take part in the 4x4 comparison.
byte[] a =
{
27, 27, 28, 29, 29, 28, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129,
129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 27, 27, 27, 27, 27, 27, 27, 28, 28, 29, 29, 28,
28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 27, 27, 26,
26, 26, 26, 27, 27, 27, 28, 28, 29, 29, 28, 28, 27, 129, 129, 129, 129, 129, 129, 129, 129, 128,
128, 128, 128, 128, 128, 128, 128, 28, 27, 27, 26, 26, 27, 27, 28, 27, 28, 28, 29, 29, 28, 28, 27,
129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128
};

// NOTE(review): with a row stride of 32, every compared pixel satisfies a >= b (b is 26 in all of them),
// so this fixture never exercises a negative difference - consider adding a case where a < b.
byte[] b =
{
26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 26, 26, 26,
26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 26, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204
};

// Expected sum of squared differences for the fixture above.
int expected = 27;

int actual = LossyUtils.Vp8_Sse4X4(a, b);

Assert.Equal(expected, actual);
}

private static void RunMean16x4Test()
{
// arrange
Expand Down Expand Up @@ -61,13 +90,22 @@ private static void RunHadamardTransformTest()
Assert.Equal(expected, actual);
}

[Fact]
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();

[Fact]
public void Mean16x4_Works() => RunMean16x4Test();

[Fact]
public void HadamardTransform_Works() => RunHadamardTransformTest();

#if SUPPORTS_RUNTIME_INTRINSICS
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

[Fact]
public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic);

[Fact]
public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);

Expand Down

0 comments on commit 7d74c4c

Please sign in to comment.