Basically, my SQL CLR solution was the fastest. However it is very interesting to see that the SSIS solution was very close in speed. It also had fewer logical reads, which leads me to the thought that a well-written SSIS solution might be even faster than a SQL CLR solution. Maybe an idea for next round?
Also very interesting to see how good T-SQL is. A well-written T-SQL solution like Matt v1 is not easy to beat with SQL CLR as you can see from my first tries (JAhlen v1 and JAhlen v2).
Here is the source code of my SQL CLR solution.
public partial class StoredProcedures
{
internal struct IntArrayBucket
{
public int Count;
public int[] Arr;
public void Add(int ssn)
{
if (Count == 0)
Arr = new int[6];
else if (Count >= Arr.Length)
Array.Resize(ref Arr, Arr.Length * 2);
Arr[Count++] = ssn;
}
}
internal struct ResultRow : IComparable<ResultRow>
{
public int vSSN;
public int ueSSN;
public byte Status;
public ResultRow(int vssn, int uessn, byte status)
{
vSSN = vssn;
ueSSN = uessn;
Status = status;
}
public int CompareTo(ResultRow other)
{
if (this.vSSN == other.vSSN)
return this.ueSSN.CompareTo(other.ueSSN);
else
return this.vSSN.CompareTo(other.vSSN);
}
}
[Microsoft.SqlServer.Server.SqlProcedure]
public static void MatchProc()
{
// Validated SSNs
int[] vSSN = new int[10000];
// Split into evens and odds, use BitArrays for extra fast lookups
IntArrayBucket[] vSSNEven = new IntArrayBucket[100000];
IntArrayBucket[] vSSNOdd = new IntArrayBucket[10000];
BitArray vSSNEvenBits = new BitArray(100000);
BitArray vSSNOddBits = new BitArray(10000);
// Exact matches
int[] exactResults = new int[10000];
int exactResultsLength = 0;
// Matches with 1 or 2 digits difference
ResultRow[] diffResults = new ResultRow[301];
int diffResultsLength = 0;
using (var connection = new SqlConnection("context connection=true"))
{
connection.Open();
using (var cmd = new SqlCommand(
"SELECT CAST([SSN] AS INT) FROM dbo.[Validated]; SELECT CAST([SSN] AS INT) FROM dbo.[UserEntered]",
connection))
{
using (var rdr = cmd.ExecuteReader())
{
var i = 0;
var vSSNlength = vSSN.Length;
// Loop through first result set - validated SSNs
while (rdr.Read())
{
if (i >= vSSNlength)
{
// Increase vSSN array size if necessary
Array.Resize(ref vSSN, vSSNlength * 2);
vSSNlength = vSSN.Length;
}
int ssn = rdr.GetInt32(0);
vSSN[i] = ssn;
var even = GetEven(ssn);
var odd = GetOdd(ssn);
vSSNEven[even].Add(ssn);
vSSNOdd[odd].Add(ssn);
vSSNEvenBits.Set(even, true);
vSSNOddBits.Set(odd, true);
i++;
}
// Adjust vSSN array size if necessary
if (i != vSSN.Length)
Array.Resize(ref vSSN, i);
rdr.NextResult();
i = 0;
// Loop through second result set - user entered SSNs
while (rdr.Read())
{
var uessn = rdr.GetInt32(0);
var ueeven = GetEven(uessn);
var ueodd = GetOdd(uessn);
// Check first for exact match. Linear search works well with small volumes, but
// should be replaced by hash or binary search for larger volumes.
if (vSSNEvenBits[ueeven] && vSSNOddBits[ueodd] && Array.IndexOf<int>(vSSNEven[ueeven].Arr, uessn) >= 0)
{
if (exactResultsLength >= exactResults.Length)
{
// Increase exactResults array size if necessary
Array.Resize(ref exactResults, exactResults.Length * 2);
}
exactResults[exactResultsLength++] = uessn;
}
else
{
// Check for 1 or 2 digit difference matches
if (vSSNEvenBits[ueeven])
{
var bucket = vSSNEven[ueeven];
for (int j = 0; j < bucket.Count; j++)
{
var vssn = bucket.Arr[j];
var status = 0;
if (((vssn / 10) % 10) != ueodd % 10)
status++;
if (((vssn / 1000) % 10) != ((ueodd / 10) % 10))
status++;
if (((vssn / 100000) % 10) != ((ueodd / 100) % 10))
status++;
if (status > 2)
continue;
if (((vssn / 10000000) % 10) != ((ueodd / 1000) % 10))
status++;
if (status <= 2)
{
// Increase array sizes if necessary
if (diffResultsLength >= diffResults.Length)
{
Array.Resize(ref diffResults, diffResults.Length * 2);
}
diffResults[diffResultsLength].vSSN = vssn;
diffResults[diffResultsLength].ueSSN = uessn;
diffResults[diffResultsLength].Status = (byte)status;
diffResultsLength++;
}
}
}
if (vSSNOddBits[ueodd])
{
var bucket = vSSNOdd[ueodd];
for (int j = 0; j < bucket.Count; j++)
{
var vssn = bucket.Arr[j];
var status = 0;
if ((vssn % 10) != ueeven % 10)
status++;
if (((vssn / 100) % 10) != ((ueeven / 10) % 10))
status++;
if (((vssn / 10000) % 10) != ((ueeven / 100) % 10))
status++;
if (status > 2)
continue;
if (((vssn / 1000000) % 10) != ((ueeven / 1000) % 10))
status++;
if (status > 2)
continue;
if (((vssn / 100000000) % 10) != ((ueeven / 10000) % 10))
status++;
if (status <= 2)
{
// Increase array sizes if necessary
if (diffResultsLength >= diffResults.Length)
{
Array.Resize(ref diffResults, diffResults.Length * 2);
}
diffResults[diffResultsLength].vSSN = vssn;
diffResults[diffResultsLength].ueSSN = uessn;
diffResults[diffResultsLength].Status = (byte)status;
diffResultsLength++;
}
}
}
}
}
// Sorts the 1 or 2 digit difference matches.
Array.Sort<ResultRow>(diffResults, 0, diffResultsLength);
if (diffResultsLength >= diffResults.Length)
{
Array.Resize(ref diffResults, diffResults.Length * 2);
}
// Add an extra value at end of array that is larger than every vSSN
diffResults[diffResultsLength].vSSN = 1000000000;
}
}
}
// Prepare output
SqlDataRecord record = new SqlDataRecord(new SqlMetaData("vSSN", SqlDbType.Char, 9),
new SqlMetaData("ueSSN", SqlDbType.Char, 9), new SqlMetaData("Status", SqlDbType.TinyInt));
SqlContext.Pipe.SendResultsStart(record);
char[] buffer = new char[9];
// Now that we have two sorted lists (exactResults and diffResults), we can
// easily merge them together and output.
var diffIndex = 0;
var exactIndex = 0;
var nextDiffvSSN = diffResults[diffIndex].vSSN;
while (exactIndex < exactResultsLength)
{
var vssn = exactResults[exactIndex];
if (vssn < nextDiffvSSN)
{
Int32ToSqlChars(buffer, vssn);
record.SetChars(0, 0, buffer, 0, 9);
record.SetChars(1, 0, buffer, 0, 9);
record.SetByte(2, (byte)0);
SqlContext.Pipe.SendResultsRow(record);
exactIndex++;
}
else
{
SortedDictionary<int, byte> results = new SortedDictionary<int, byte>();
if (vssn == nextDiffvSSN)
{
results.Add(vssn, 0);
exactIndex++;
}
while (diffResults[diffIndex].vSSN == nextDiffvSSN)
{
results.Add(diffResults[diffIndex].ueSSN, diffResults[diffIndex].Status);
diffIndex++;
}
foreach (var result in results)
{
Int32ToSqlChars(buffer, nextDiffvSSN);
record.SetChars(0, 0, buffer, 0, 9);
Int32ToSqlChars(buffer, result.Key);
record.SetChars(1, 0, buffer, 0, 9);
record.SetByte(2, result.Value);
SqlContext.Pipe.SendResultsRow(record);
}
nextDiffvSSN = diffResults[diffIndex].vSSN;
}
}
while (diffIndex < diffResultsLength)
{
Int32ToSqlChars(buffer, diffResults[diffIndex].vSSN);
record.SetChars(0, 0, buffer, 0, 9);
Int32ToSqlChars(buffer, diffResults[diffIndex].ueSSN);
record.SetChars(1, 0, buffer, 0, 9);
record.SetByte(2, diffResults[diffIndex].Status);
SqlContext.Pipe.SendResultsRow(record);
diffIndex++;
}
// End
SqlContext.Pipe.SendResultsEnd();
}
private static void Int32ToSqlChars(char[] buffer, int ssn)
{
buffer[0] = (char)('0' + (ssn / 100000000));
buffer[1] = (char)('0' + ((ssn / 10000000)) % 10);
buffer[2] = (char)('0' + ((ssn / 1000000)) % 10);
buffer[3] = (char)('0' + ((ssn / 100000)) % 10);
buffer[4] = (char)('0' + ((ssn / 10000)) % 10);
buffer[5] = (char)('0' + ((ssn / 1000)) % 10);
buffer[6] = (char)('0' + ((ssn / 100)) % 10);
buffer[7] = (char)('0' + ((ssn / 10)) % 10);
buffer[8] = (char)('0' + (ssn % 10));
}
private static Int32 SqlCharsToInt32(SqlChars sqlChars)
{
var buf = sqlChars.Buffer;
return (buf[0] - '0') * 100000000 + (buf[1] - '0') * 10000000 + (buf[2] - '0') * 1000000 + (buf[3] - '0') * 100000 + (buf[4] - '0') * 10000 + (buf[5] - '0') * 1000 + (buf[6] - '0') * 100 + (buf[7] - '0') * 10 + (buf[8] - '0');
}
private static int GetEven(int num)
{
return (num % 10) +
((num / 100) % 10) * 10 +
((num / 10000) % 10) * 100 +
((num / 1000000) % 10) * 1000 +
((num / 100000000) % 10) * 10000;
}
private static int GetOdd(int num)
{
return ((num / 10) % 10) +
((num / 1000) % 10) * 10 +
((num / 100000) % 10) * 100 +
((num / 10000000) % 10) * 1000;
}
};