I pulled out a few bits of code and patched them together so a test case
does the same kind of math as the real deal. (Don't be a style freak -
it's a demo fragment squished to fit in a Usenet posting.)
Machine:
MacOS X 10.5.8
Darwin desktop.pixelmemory.us 9.8.0 Darwin Kernel Version 9.8.0: Wed Jul
15 16:55:01 PDT 2009; root:xnu-1228.15.4~1/RELEASE_I386 i386
---------------
Java 1.5
Version:
java version "1.5.0_20"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.5.0_20-b02-308)
Java HotSpot(TM) 64-Bit Server VM (build 1.5.0_19-137, mixed mode)
Options: -d64 -mx2G
Output:
Millis: 5979.04
Millis: 5984.168
Millis: 5987.027
Millis: 5979.992
Millis: 5953.974
---------------
Java 1.6
Version:
java version "1.6.0_15"
Java(TM) SE Runtime Environment (build 1.6.0_15-b02-215)
Java HotSpot(TM) 64-Bit Server VM (build 14.1-b02-87, mixed mode)
Options: -d64 -mx2G
Output:
Millis: 6943.407
Millis: 6937.324
Millis: 6917.524
Millis: 6931.662
Millis: 6917.065
---------------
/**
 * Micro-benchmark that filters a 640x480 packed-ARGB source image down to a
 * 320x240 destination image.
 *
 * Every destination pixel is produced from a 4x4 block of source pixels
 * (the block origin advances by two source pixels per destination pixel).
 * Each source pixel contributes its colour channels weighted by
 * (alpha * kernel coefficient); the accumulated channels are then divided by
 * the accumulated weight and packed back into an ARGB int.
 *
 * Pixel layout, as given by the shift/mask constants: alpha in bits 24-31,
 * red in bits 16-23, green in bits 8-15, blue in bits 0-7.
 */
public class Benchmark
{
    /** Bit offsets of the four channels inside a packed pixel. */
    static final int s_offR= 16, s_offG= 8, s_offB= 0, s_offA= 24;

    /** Bit masks of the four channels inside a packed pixel. */
    static final int s_maskR= 0xff0000, s_maskG= 0xff00,
                     s_maskB= 0xff, s_maskA= 0xff000000;

    /**
     * Runs one untimed warm-up pass, then five timed rounds of 1000 passes
     * each, printing the elapsed wall-clock milliseconds of every round.
     *
     * @param args ignored
     */
    public static void main (final String args[])
    {
        final Benchmark b= new Benchmark ();
        b.rasterize();  // warm-up pass, not timed
        for (int restest= 0; restest < 5; ++restest)
        {
            final long start= System.nanoTime();
            for (int i= 0; i < 1000; ++i)
                b.rasterize();
            final long end= System.nanoTime();
            System.out.println("Millis: " + (end - start) / 1000000d);
        }
    }

    /** Source (640x480) and destination (320x240) pixel buffers. */
    final int m_src[], m_dst[];

    /** Number of array elements per row in the source / destination buffer. */
    final int m_srcYstride, m_dstYstride;

    /**
     * Per-pixel accumulators: alpha-and-kernel weighted channel sums
     * (m_R, m_G, m_B) and the total weight (m_A). Reset by rasterize()
     * before each destination pixel, filled by read4x4(), consumed by
     * writePixel().
     */
    float m_R, m_G, m_B, m_A;

    /** Allocates zero-filled source and destination buffers. */
    public Benchmark ()
    {
        m_src= new int [640 * 480];
        m_dst= new int [320 * 240];
        m_srcYstride= 640;
        m_dstYstride= 320;
    }

    /**
     * Filters the whole source buffer into the destination buffer.
     *
     * Block origins run over srcX = 0, 2, ..., 634 and srcY = 0, 2, ..., 474,
     * so destination columns 0..317 and rows 0..237 are written; the last
     * two columns and rows of the destination are left untouched.
     */
    void rasterize ()
    {
        final short kerns[][][]= new short[][][] {
            {{300, 134, -23, 121}, {234, 45, 12, -18},
             {37, -86, 7, 0}, {4, 86, -13, 197}},
            {{300, 134, -23, 123}, {45, 234, 12, -20},
             {37, -54, 7, 0}, {4, 54, -13, 197}}
        };
        // NOTE(review): the coefficients of both kernels above add up to
        // 1037, not 1069, so a fully opaque block yields alpha 247 rather
        // than 255. Left as posted - confirm the intended value.
        final int sum= 1069;
        for (int srcY= 0, dstY= 0, ySrcScan= 0;
             srcY < (480 - 4);
             srcY+= 2, ++dstY, ySrcScan+= 2*m_srcYstride)
        {
            for (int srcX= 0, dstX= 0; srcX < (640 - 4); srcX+= 2, ++dstX)
            {
                m_R= m_G= m_B= m_A= 0;
                // Pass the INDEX of the block's top-left pixel. The posted
                // code passed the pixel VALUE stored there, which read4x4
                // then used as an array offset: with the all-zero source it
                // always read the block at index 0, and with real pixel data
                // it would index far outside m_src.
                // NOTE(review): srcY is always even, so (srcY & 1) always
                // selects kerns[0]; the second kernel is never used.
                read4x4 (kerns[srcY & 1], ySrcScan + srcX, 0, 0);
                writePixel (sum, dstX, dstY);
            }
        }
    }

    /**
     * Converts the current accumulators into one packed pixel and stores it
     * in the destination buffer.
     *
     * If the accumulated weight m_A is positive, each colour channel is
     * m_X / m_A and alpha is m_A / kernelSum, each truncated to int and
     * clamped to 0..255. Otherwise the pixel is written as 0.
     *
     * @param kernelSum divisor used to turn the accumulated weight into alpha
     * @param dstX      destination column
     * @param dstY      destination row
     */
    final void writePixel
        (final float kernelSum, final int dstX, final int dstY)
    {
        final int v;
        if (m_A > 0)
        {
            final int r= (int)(m_R / m_A);
            final int g= (int)(m_G / m_A);
            final int b= (int)(m_B / m_A);
            final int a= (int)(m_A / kernelSum);
            // Shift binds tighter than ?:, so each term is
            // "0 if negative, else (clamped value << offset)"; the grouping
            // is spelled out with parentheses here.
            v= (((r < 0) ? 0 : (((r > 255) ? 255 : r) << s_offR)) & s_maskR)
             | (((g < 0) ? 0 : (((g > 255) ? 255 : g) << s_offG)) & s_maskG)
             | (((b < 0) ? 0 : (((b > 255) ? 255 : b) << s_offB)) & s_maskB)
             | (((a < 0) ? 0 : (((a > 255) ? 255 : a) << s_offA)) & s_maskA);
        }
        else
            v= 0;
        m_dst[dstX + dstY * m_dstYstride]= v;
    }

    /**
     * Accumulates one 4x4 block of source pixels into m_R, m_G, m_B, m_A.
     *
     * For each of the 16 pixels the weight is (pixel alpha * kernel
     * coefficient); the weight is added to the A sum and weight * channel is
     * added to the R, G and B sums. Sums are built in int locals and added
     * to the float fields once at the end. The 16 taps are written out by
     * hand rather than looped.
     *
     * @param array kernel coefficients, indexed [row][column]
     * @param scan  index into m_src of the block's top-left pixel
     * @param kx    column offset of the first coefficient used in each row
     * @param ky    row offset of the first coefficient row used
     */
    final void read4x4
        (final short[][] array, final int scan, final int kx, final int ky)
    {
        int R = 0, G = 0, B = 0, A = 0;
        final short k0[]= array[ky];
        final short k1[]= array[ky + 1];
        final short k2[]= array[ky + 2];
        final short k3[]= array[ky + 3];

        // ---- block row 0 ----
        {
            final short k = k0[kx + 0];
            final int value = m_src[scan];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R= alphaMult * ((value & s_maskR) >>> s_offR);
            G= alphaMult * ((value & s_maskG) >>> s_offG);
            B= alphaMult * ((value & s_maskB) >>> s_offB);
            A= alphaMult;
        }
        {
            final short k = k0[kx + 1];
            final int value = m_src[scan + 1];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k0[kx + 2];
            final int value = m_src[scan + 2];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k0[kx + 3];
            final int value = m_src[scan + 3];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }

        // ---- block row 1 ----
        {
            final short k = k1[kx + 0];
            final int value = m_src[scan + m_srcYstride];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k1[kx + 1];
            final int value = m_src[scan + m_srcYstride + 1];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k1[kx + 2];
            final int value = m_src[scan + m_srcYstride + 2];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k1[kx + 3];
            final int value = m_src[scan + m_srcYstride + 3];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }

        // ---- block row 2 ----
        {
            final short k = k2[kx + 0];
            final int value = m_src[scan + m_srcYstride + m_srcYstride];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k2[kx + 1];
            final int value = m_src[scan + m_srcYstride + m_srcYstride + 1];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k2[kx + 2];
            final int value = m_src[scan + m_srcYstride + m_srcYstride + 2];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k2[kx + 3];
            final int value = m_src[scan + m_srcYstride + m_srcYstride + 3];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }

        // ---- block row 3 ----
        {
            final short k = k3[kx + 0];
            final int value = m_src[scan + 3 * m_srcYstride];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k3[kx + 1];
            final int value = m_src[scan + 3 * m_srcYstride + 1];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k3[kx + 2];
            final int value = m_src[scan + 3 * m_srcYstride + 2];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }
        {
            final short k = k3[kx + 3];
            final int value = m_src[scan + 3 * m_srcYstride + 3];
            final int alphaMult = ((value & s_maskA) >>> s_offA) * k;
            R += alphaMult * ((value & s_maskR) >>> s_offR);
            G += alphaMult * ((value & s_maskG) >>> s_offG);
            B += alphaMult * ((value & s_maskB) >>> s_offB);
            A += alphaMult;
        }

        // Fold the integer block sums into the float accumulators.
        m_R+= R;
        m_G+= G;
        m_B+= B;
        m_A+= A;
    }
}