Similar Names Matching
This is the code that performs similar name matching on the Tasmanian Court Lists pages. It uses the Phonix package, available on NuGet, to perform matching with the Double Metaphone algorithm. Some notes on its operation:
- I am only matching when both names contain at least three components. Lowering this number produces a lot of false positives when someone with a common name like John Smith is encountered without other names.
- The algorithm matches some names such as Kelly and Cole that are unlikely to be related. I might do some further work on improving the matching.
- Names appearing on the court list pages aren't unique and may not be the person you are seeking. The same applies even more so to similar names: many will be false positives or a different individual who happens to have a similar name.
C# Source Code
/// <summary>
/// Compares every stored name against every other stored name and records the pairs
/// whose components are all phonetically similar according to Double Metaphone.
/// </summary>
/// <remarks>
/// Only names with at least <c>MinElementsToMatch</c> components take part. Two names match
/// when every component in the positions they share (up to the shorter name's length) is
/// similar. Every ordered pair is tested, so a match between A and B is written twice:
/// once as A -> B and once as B -> A.
/// </remarks>
void Main()
{
    const int MinElementsToMatch = 3;

    var names = Names
        .Select(n => new { Id = n.NameId, n.Name })
        .ToDictionary(n => n.Id, n => n.Name);

    // Parse each name once up front instead of re-parsing it inside the nested loop,
    // and drop names that are too short to ever take part in a match.
    var candidates = names
        .Select(n => new { Id = n.Key, Name = n.Value, Components = ParseName(n.Value) })
        .Where(c => c.Components.Length >= MinElementsToMatch)
        .ToList();

    var compare = new DoubleMetaphone();

    foreach (var source in candidates)
    {
        foreach (var test in candidates)
        {
            if (test.Id == source.Id)
            {
                continue;
            }

            var componentsToCheck = Math.Min(source.Components.Length, test.Components.Length);

            // A pair only counts when every compared position is similar, so stop at the first mismatch.
            var allSimilar = true;
            for (var i = 0; i < componentsToCheck; i++)
            {
                if (!compare.IsSimilar(new[] { source.Components[i], test.Components[i] }))
                {
                    allSimilar = false;
                    break;
                }
            }

            if (!allSimilar)
            {
                continue;
            }

            Console.WriteLine($"{componentsToCheck} {source.Name} *** {test.Name}");
            DuplicateNames.InsertOnSubmit(new DuplicateName { NameId = source.Id, DuplicateNameId = test.Id });

            // Submitted per match, so pairs found so far are already written if the run stops part-way.
            SubmitChanges();
        }
    }
}
/// <summary>
/// Splits a name into its components.
/// </summary>
/// <param name="name">
/// The name to split. If it contains a comma, the text before the first comma is kept as a
/// single leading component (even when it contains spaces) and the text after it is split on spaces.
/// Without a comma the whole name is split on spaces.
/// </param>
/// <returns>
/// The components in order, with no empty entries produced by repeated, leading or trailing
/// spaces. Returns an empty array when <paramref name="name"/> is null or blank.
/// </returns>
string[] ParseName(string name)
{
    if (string.IsNullOrWhiteSpace(name))
    {
        return new string[0];
    }

    var separators = new[] { ' ' };

    // The char overload is an ordinal search; the string overload is culture-sensitive.
    var commaPos = name.IndexOf(',');
    if (commaPos < 0)
    {
        // Same split options as the comma branch, so extra spaces never create empty components.
        return name.Split(separators, StringSplitOptions.RemoveEmptyEntries);
    }

    // Trim so "Smith , John" yields "Smith" rather than "Smith ".
    var lastName = name.Substring(0, commaPos).Trim();
    var otherNames = name.Substring(commaPos + 1).Split(separators, StringSplitOptions.RemoveEmptyEntries);
    return new[] { lastName }.Concat(otherNames).ToArray();
}