Fuzzy Search With Multiple Items Table Return in SQLite

In Fuzzy Search, Pattern Matching and Retrieval Using Python and SQLite, I reviewed several tools that could be used to support fuzzy search, giving an example of a scalar SQLite custom application function that could be used to retrieve a document containing a fuzzy matched search term.

In Custom SQLite Functions That Return A Table: A SQLite Spell-Checker, I showed how we can create functions that can return multiple values via a virtual table.

In this post, I’ll describe a couple of custom virtual table generating functions that support fuzzy searching within document records and the return of document fragments, one per row, associated with the matches.

The trick to creating table returning functions is provided by Charles Leifer’s coleifer/sqlite-vtfunc package, as described in SQLite Table-Valued Functions with Python. The demos for that package show how to create a regular expression search function:

#https://charlesleifer.com/blog/sqlite-table-valued-functions-with-python/
#https://github.com/coleifer/sqlite-vtfunc
#%pip install git+https://github.com/coleifer/sqlite-vtfunc.git

from vtfunc import TableFunction
import re

from vtfunc import TableFunction

class RegexSearch(TableFunction):
    """Table-valued SQLite function: one row per regex match in a string."""
    params = ['pattern', 'search_string']
    columns = ['match']
    name = 'regex_search'

    def initialize(self, pattern=None, search_string=None):
        # Walk matches lazily; StopIteration from the iterator ends the table.
        self._matches = re.finditer(pattern, search_string)

    def iterate(self, idx):
        # `idx` is supplied by the vtab machinery but is not needed here.
        match = next(self._matches)
        return (match.group(0),)

# The `db` object is a sqlite_utils Database object wrapping test.db
from sqlite_utils import Database
db = Database('test.db')

# Register the table-valued function against the underlying sqlite3 connection
RegexSearch.register(db.conn)

Here’s what happens if we try it out:

# Demo: re.finditer lazily yields one Match per <a href="...">...</a>
# occurrence. The pattern is a raw string so the backslashes reach the
# regex engine verbatim — as a plain string the \>, \" and \< sequences
# are invalid escape sequences and raise warnings on modern Pythons.
results = re.finditer(
            r'<a[^\>]+?href="([^\"]+?)"[^\>]*?>([^\<]+?)</a>',
            'Junk  <a href="http://www.bbc.co.uk">Link text</a> wrapper; Junk  <a href="http://www.open.co.uk">Link text</a> wrapper')

# Capture the first two matches' groups so the result can be inspected.
link_pairs = (next(results).groups(), next(results).groups())
link_pairs

"""
(('http://www.bbc.co.uk', 'Link text'), ('http://www.open.co.uk', 'Link text'))
"""

It’s not too hard to crib this to create a partial regex matcher using the regex package. We can extend the returned table to give the start and end character locations of the matched term in the response table:

#%pip install regex
import regex

class RegexSearch2(TableFunction):
    """Regex search that also reports the character span of each match.

    Uses the third-party `regex` package, so fuzzy patterns such as
    (term){e<=1} are supported. Rows are (match, start, end).
    """
    params = ['pattern', 'search_string']
    columns = ['match', 'start', 'end']
    name = 'regex_search2'

    def initialize(self, pattern=None, search_string=None):
        self._matches = regex.finditer(pattern, search_string)

    def iterate(self, idx):
        # One row per match; exhaustion of the iterator ends the table.
        m = next(self._matches)
        return (m.group(0), m.start(), m.end(),)

RegexSearch2.register(db.conn)

Here’s how it works:

# Fuzzy pattern (regex-module syntax): match "sin eater" allowing up to one edit.
query_params = ("(sin eater){e<=1}", "The bin eater is a sin-eater thing...")

# Each row is (match, start, end) from the regex_search2 virtual table.
for i in db.execute('SELECT * FROM regex_search2(?, ?);', query_params):
    print(i)

"""
('bin eater', 4, 13)
('sin-eater', 19, 28)
"""

We can also create fuzzy matchers using the fuzzysearch.find_near_matches() function:

#%pip install fuzzysearch
from fuzzysearch import find_near_matches

class FindNearMatches(TableFunction):
    """Expose fuzzysearch.find_near_matches() as a SQLite table function.

    Each row is (matched, start, end, dist) for one approximate occurrence
    of `target_string` inside `search_string`. The Levenshtein budget is
    fixed at max_l_dist=2.
    """
    params = ['target_string', 'search_string']
    columns = ['matched', 'start', 'end', 'dist']
    name = 'find_near_matches'

    def initialize(self, target_string=None, search_string=None):
        found = find_near_matches(target_string, search_string, max_l_dist=2)
        self._matches = iter(found)

    def iterate(self, idx):
        m = next(self._matches)
        return (m.matched, m.start, m.end, m.dist,)

# And register the function
FindNearMatches.register(db.conn)

Here’s how it looks:

# Deliberately corrupted copies of PATTERN to exercise the fuzzy matcher.
query_params = ('PATTERN', '--PRTTERN-PATERN---PATEN----PATN--PATREN-')

# NOTE: `SELECT *, matched` repeats the matched column at the end of each row.
for i in db.execute('SELECT *, matched FROM find_near_matches(?, ?);', query_params):
    print(i)

"""
('PRTTERN', 2, 9, 1, 'PRTTERN')
('PATERN', 10, 16, 1, 'PATERN')
('PATEN', 19, 24, 2, 'PATEN')
('PATREN', 34, 40, 2, 'PATREN')
"""

One disadvantage of the above is that we have hardwired the max_l_dist parameter; but we can add in any number of additional parameters as we require:

class FindNearMatches2(TableFunction):
    """Fuzzy matcher table function with a caller-supplied edit distance.

    Like FindNearMatches, but max_l_dist is passed in as a query
    parameter (defaulting to 2) instead of being hardwired.
    """
    params = ['target_string', 'search_string', 'max_l_dist']
    columns = ['matched', 'start', 'end', 'dist']
    name = 'find_near_matches2'

    def initialize(self, target_string=None, search_string=None, max_l_dist=2):
        found = find_near_matches(target_string, search_string,
                                  max_l_dist=max_l_dist)
        self._matches = iter(found)

    def iterate(self, idx):
        row = next(self._matches)
        return (row.matched, row.start, row.end, row.dist,)

# And register the function
FindNearMatches2.register(db.conn)

Here’s how it looks this time:

# Same corrupted test string, but with max_l_dist=1, so only
# single-edit variants of PATTERN should be returned.
query_params = ('PATTERN', '--PRTTERN-PATERN---PATEN----PATN--PATREN-', 1)
for i in db.execute('SELECT *, matched FROM find_near_matches2(?, ?, ?);', query_params):
    print(i)

"""
('PRTTERN', 2, 9, 1, 'PRTTERN')
('PATERN', 10, 16, 1, 'PATERN')
"""

We can, of course, use the functions as part of a query that passes in the text values from other tables. So for example, here is a test table:

# Create (or reference) a sqlite_utils table to run fuzzy queries against.
fuzzytests = db["fuzzytests"]
 
# Add some rows
fuzzytests.insert({'id':1,
                   'text':'Some text about the bin-eater.'})
fuzzytests.insert({'id':2,
                   'text':'Something about sin eating'})
fuzzytests.insert({'id':3,
                   'text':'Nothing to see here...'})

And we can run some queries against it:

# Run our fuzzy searcher over the rows
# Run our fuzzy searcher over the rows: the table-valued function is joined
# against fuzzytests, so fuzzytests.text feeds each per-row search.
# (Dropped the stray f-string prefix — the SQL literal has no placeholders,
# and an f prefix invites accidental interpolation into SQL text.)
for i in db.execute('SELECT * FROM fuzzytests, find_near_matches2("sin eater", fuzzytests.text, 2);'):
    print(i)

"""
(1, 'Some text about the bin-eater.', 'bin-eater', 20, 29, 2)
(2, 'Something about sin eating', 'sin eatin', 16, 25, 2)
"""

Other fuzzy search tools are available, of course. So here’s an example using the gandersen101/spaczz fuzzy matcher for spaCy:

from spaczz.matcher import FuzzyMatcher
# Bug fix: en_core_web_sm was used below but never imported (NameError).
import en_core_web_sm

nlp = en_core_web_sm.load()

class SQLite_FuzzyMatcher(TableFunction):
    """Table-valued fuzzy matcher backed by spaczz's FuzzyMatcher.

    Rows are (matched, start, end, fragment, ratio); start/end are token
    offsets and ratio is spaczz's match-quality score.
    """
    params = ['target_string', 'search_string']
    columns = ['matched', 'start', 'end', 'fragment', 'ratio']
    name = 'find_fuzzy_matches'

    def initialize(self, target_string=None, search_string=None):
        doc = nlp( search_string )
        matcher = FuzzyMatcher(nlp.vocab)
        # We can tune the sensitivity for the match; default is 75
        matcher.add(search_string, [nlp(target_string)], kwargs=[{"min_r2": 75}])
        self._iter = iter(matcher(doc))

    def iterate(self, idx):
        # ratio is a quality score for the match
        matched, start, end, ratio = next(self._iter)
        # NOTE(review): `matched` is the label passed to matcher.add() — here
        # the full search_string — so nlp(matched)[start:end] recovers the
        # matched token span from the document text. Confirm against spaczz docs.
        fragment = nlp(matched)[start:end].text
        return (matched, start, end, fragment, ratio,)

# And register the function
SQLite_FuzzyMatcher.register(db.conn)

And when we run it?

# Alias the table-valued function as fz: each fuzzytests row is joined with
# its fuzzy matches, so rows with no match simply drop out.
# (Dropped the stray f-string prefix — the SQL has no placeholders, and an
# f prefix risks accidental interpolation into SQL text.)
_q = """
SELECT *
FROM fuzzytests, find_fuzzy_matches('sin eater', fuzzytests.text) AS fz ;
"""

# db.query() yields result rows as dicts keyed by column name.
list(db.query(_q))

"""
[{'id': 1,
  'text': 'Some text about the bin-eater.',
  'matched': 'sin eater',
  'start': 4,
  'end': 7,
  'ratio': 78}]
"""

Next up – a porter stemmer search tool, maybe?!

Author: Tony Hirst

I'm a Senior Lecturer at The Open University, with an interest in #opendata policy and practice, as well as general web tinkering...

%d bloggers like this: