Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,6 @@ dev/data/geo_address.csv
dev/geo_matching.py
*.so
.coverage
simstring/site/
simstring/site/
tmp*/
addresses.csv
87 changes: 46 additions & 41 deletions dev/company_names.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,61 @@
# coding: utf-8

import os, sys

import numpy as np

from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.measure.cosine import (
CosineMeasure,
) # , OverlapMeasure, LeftOverlapMeasure
from simstring.measure.cosine import CosineMeasure
from simstring.measure.overlap import OverlapMeasure, LeftOverlapMeasure

# from simstring.database.mongo import MongoDatabase
from simstring.database.dict import DictDatabase
from simstring.database.disk import DiskDatabase
from simstring.database.redis import RedisDatabase
from simstring.searcher import Searcher
from tqdm import tqdm

from pyinstrument import Profiler

profiler = Profiler()


def output_similar_strings_of_each_line(path, measure):
def output_similar_strings_of_each_line(path, measures, db_cls):
strings = []
with open(path, "r") as lines:
for line in lines:
strings.append(line.rstrip("\r\n"))

db = DictDatabase(CharacterNgramFeatureExtractor(2))
for string in strings:
strings.append(line.rstrip("\r\n").strip().lower())

db = make_db(db_cls, strings)

for measure in measures:
searcher = Searcher(db, measure)
profiler.start()

for string in strings:
result = searcher.search(string, 0.8)

profiler.stop()
print(result)
print(db_cls.__name__, measure.__class__.__name__)
profiler.print()

def make_db(db_cls, strings):
db = db_cls(CharacterNgramFeatureExtractor(2))
i = 0
for string in tqdm(strings):
db.add(string)

# db.save("companies.db")

# dbl = DictDatabase.load("companies.db")

searcher = Searcher(db, measure)
profiler.start()

for string in strings:
result = searcher.search(string, 0.8)
# result = [str(np.round(x[0], 5)) + ' ' + x[1] for x in searcher.ranked_search(string, 0.8)]
# print("\t".join([string, ",".join(result)]))

profiler.stop()
print(result)
profiler.print()
# profiler.open_in_browser()


measure = CosineMeasure()
output_similar_strings_of_each_line("dev/data/company_names.txt", measure)

# measure = OverlapMeasure()
# output_similar_strings_of_each_line("dev/data/company_names.txt", measure)

# measure = LeftOverlapMeasure()
# output_similar_strings_of_each_line("./data/company_names.txt", measure)
i += 1
if (i % 10000) == 0:
db.commit()
i = 0
db.commit()
return db

if __name__ =="__main__":
file = "dev/data/company_names.txt"
# file = "dev/data/unabridged_dictionary.txt"
# file = "dev/data/addresses.csv"
# measures = [CosineMeasure(), OverlapMeasure(), LeftOverlapMeasure()]
measures = [CosineMeasure()]
dbs = [DictDatabase,DiskDatabase, RedisDatabase]
# dbs = [DiskDatabase]
for db_cls in dbs:
output_similar_strings_of_each_line(file, measures, db_cls)

# for db_cls in [DictDatabase,DiskDatabase]:
# output_similar_strings_of_each_line("dev/data/unabridged_dictionary2.txt", measures, db_cls)
13 changes: 9 additions & 4 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ classifiers = [
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: Implementation :: CPython",
]
dependencies = []
dependencies = ["diskcache==5.6.1", "fakeredis", "redis"]
dynamic = ["version"]

[project.urls]
Expand All @@ -45,13 +45,18 @@ mypy-args = [
"--check-untyped-defs",
"--install-types"
]
exclude = [
"simstring/database/disk.py",
"simstring/database/redis.py",
"simstring/database/base.py",
]

[tool.hatch.envs.test]
dependencies = [
"pytest",
"pytest-cov",
"build",
"cython"
"cython==3.0.0"
]

[tool.hatch.envs.default.scripts]
Expand All @@ -61,7 +66,7 @@ no-cov = "cov --no-cov {args}"
build = "python -m build"

[[tool.hatch.envs.test.matrix]]
python = [ "38", "39", "310", "311"]
python = ["38","39", "310", "311"]

[tool.coverage.run]
branch = true
Expand Down Expand Up @@ -94,7 +99,7 @@ serve = "cd simstring && mkdocs serve --dev-addr localhost:8000"

[tool.hatch.envs.benchmark]
dependencies = [
"pyinstrument", "benchmarker" , "numpy"
"pyinstrument", "benchmarker" , "numpy", "tqdm"
]
[[tool.hatch.envs.benchmark.matrix]]
python = [ "38", "39", "310", "311"]
Expand Down
3 changes: 3 additions & 0 deletions simstring/database/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@ def max_feature_size(self):

def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
    """Return the strings whose feature set has *size* features and contains *feature*.

    Abstract: concrete database backends must override this.
    """
    raise NotImplementedError

def commit(self):
    """Flush any buffered writes to the backing store.

    No-op by default; backends that batch writes may override this.
    """
    pass
102 changes: 102 additions & 0 deletions simstring/database/disk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@

from typing import List, Set, Dict, Union
from .base import BaseDatabase
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor

from io import BufferedWriter
import diskcache as dc
from functools import lru_cache

import os

FeatureExtractor = Union[
CharacterNgramFeatureExtractor, WordNgramFeatureExtractor
]

class DiskDatabase(BaseDatabase):
    """On-disk simstring database backed by two ``diskcache.Cache`` stores.

    One cache maps a feature-set size to the set of strings whose feature
    sets have that size; the other maps a composite ``"{size}-{feature}"``
    key to the set of strings containing that feature.
    """

    def __init__(
        self,
        feature_extractor: FeatureExtractor,
        path: str = 'tmp'
    ):
        """
        Args:
            feature_extractor: extractor producing the n-gram features of a string.
            path: directory under which the two diskcache stores are created.
        """
        self.feature_extractor = feature_extractor
        # Each map lives in its own sub-directory so the caches do not collide.
        self.feature_set_size_to_string_map: dc.Cache = dc.Cache(
            os.path.join(path, 'feature_set_size_to_string_map'))
        self.feature_set_size_and_feature_to_string_map: dc.Cache = dc.Cache(
            os.path.join(path, 'feature_set_size_and_feature_to_string_map'))
        # Sentinel extremes; narrowed as strings are added.
        self._min_feature_size = 9999999
        self._max_feature_size = 0
        self.path = path

    @staticmethod
    def _make_key(size: int, feature: str) -> str:
        """Build the composite cache key for a (size, feature) pair."""
        return f"{size}-{feature}"

    def add_feature_set_size_and_feature_to_string_map(self, size: int, feature: str, string: str) -> None:
        """Record that *string* (whose feature set has *size* features) contains *feature*."""
        key = self._make_key(size, feature)
        if key in self.feature_set_size_and_feature_to_string_map:
            d = self.feature_set_size_and_feature_to_string_map[key]
            if string in d:
                # Already recorded; skip the redundant (and expensive) disk write.
                return
        else:
            d = set()
        d.add(string)
        # diskcache serializes values on assignment, so the mutated set must
        # be written back explicitly for the change to persist.
        self.feature_set_size_and_feature_to_string_map[key] = d

    def get_feature_set_size_and_feature_to_string_map(self, size: int, feature: str
                                                       ) -> Set[str]:
        """Return the strings recorded for (size, feature); empty set if none."""
        try:
            return self.feature_set_size_and_feature_to_string_map[self._make_key(size, feature)]
        except KeyError:
            return set()

    def commit(self) -> None:
        """No-op: diskcache persists each write as it happens."""
        pass

    def add(self, string: str) -> None:
        """Index *string* under its feature-set size and under each of its features."""
        features, size = self._process_string(string)
        for feature in features:
            self.add_feature_set_size_and_feature_to_string_map(size, feature, string)

    def fast_add(self, string: str) -> None:
        """Alias of :meth:`add`, kept for backward compatibility.

        The original implementation was a verbatim copy of ``add``; it now
        delegates so the indexing logic lives in one place.
        """
        self.add(string)

    def _process_string(self, string: str):
        """Extract features, register *string* by feature-set size, and update bounds.

        Returns:
            (features, size): the feature list and its length.
        """
        features = self.feature_extractor.features(string)
        size = len(features)

        if size not in self.feature_set_size_to_string_map:
            size_to_string_map = set()
        else:
            size_to_string_map = self.feature_set_size_to_string_map[size]

        size_to_string_map.add(string)
        # Write back: diskcache does not observe in-place mutation.
        self.feature_set_size_to_string_map[size] = size_to_string_map

        self._min_feature_size = min(self._min_feature_size, size)
        self._max_feature_size = max(self._max_feature_size, size)
        return features, size

    def all(self) -> List[str]:
        """Return every string stored in the database."""
        strings = []
        for k in self.feature_set_size_to_string_map.iterkeys():
            strings.extend(self.feature_set_size_to_string_map[k])
        return strings

    def lookup_strings_by_feature_set_size_and_feature(
        self, size: int, feature: str
    ) -> Set[str]:
        """Return all strings whose feature set has *size* features and contains *feature*."""
        return self.get_feature_set_size_and_feature_to_string_map(size, feature)

    def min_feature_size(self) -> int:
        """Smallest feature-set size seen so far (sentinel 9999999 when empty)."""
        return self._min_feature_size

    def max_feature_size(self) -> int:
        """Largest feature-set size seen so far (0 when empty)."""
        return self._max_feature_size



65 changes: 65 additions & 0 deletions simstring/database/redis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@

from typing import List, Set, Dict, Union
from .base import BaseDatabase
from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor

from io import BufferedWriter
from redis import Redis
from fakeredis import FakeRedis
from functools import lru_cache

import os

FeatureExtractor = Union[
CharacterNgramFeatureExtractor, WordNgramFeatureExtractor
]

class RedisDatabase(BaseDatabase):
    """Redis-backed simstring database using two logical Redis databases.

    db 0 maps a feature-set size to the Redis set of strings with that size;
    db 1 maps a composite ``"{size}-{feature}"`` key to the Redis set of
    strings containing that feature.
    """

    def __init__(
        self,
        feature_extractor: FeatureExtractor,
        # NOTE: this is a connection *factory* (a class such as Redis or
        # FakeRedis), not a connection instance — it is called below with
        # db=/decode_responses= keyword arguments. The previous annotation
        # (``Union[Redis, FakeRedis]``) incorrectly described an instance.
        redis_connection=FakeRedis
    ):
        """
        Args:
            feature_extractor: extractor producing the n-gram features of a string.
            redis_connection: callable returning a Redis-like client; defaults
                to ``FakeRedis`` so the database works without a server.
        """
        self.feature_extractor = feature_extractor
        # decode_responses=True so smembers() yields str, not bytes.
        self.feature_set_size_to_string_map = redis_connection(db=0, decode_responses=True)
        self.feature_set_size_and_feature_to_string_map = redis_connection(db=1, decode_responses=True)
        # Sentinel extremes; narrowed as strings are added.
        self._min_feature_size = 9999999
        self._max_feature_size = 0

    @staticmethod
    def _make_key(size: int, feature: str) -> str:
        """Build the composite Redis key for a (size, feature) pair."""
        return f"{size}-{feature}"

    def add(self, string: str) -> None:
        """Index *string* under its feature-set size and under each of its features."""
        features = self.feature_extractor.features(string)
        size = len(features)
        self.feature_set_size_to_string_map.sadd(size, string)

        self._min_feature_size = min(self._min_feature_size, size)
        self._max_feature_size = max(self._max_feature_size, size)

        for feature in features:
            self.feature_set_size_and_feature_to_string_map.sadd(self._make_key(size, feature), string)

    def all(self) -> List[str]:
        """Return every string stored in the database."""
        strings = []
        for k in self.feature_set_size_to_string_map.keys():
            strings.extend(self.feature_set_size_to_string_map.smembers(k))
        return strings

    def lookup_strings_by_feature_set_size_and_feature(
        self, size: int, feature: str
    ) -> Set[str]:
        """Return all strings whose feature set has *size* features and contains *feature*."""
        return self.feature_set_size_and_feature_to_string_map.smembers(self._make_key(size, feature))

    def min_feature_size(self) -> int:
        """Smallest feature-set size seen so far (sentinel 9999999 when empty)."""
        return self._min_feature_size

    def max_feature_size(self) -> int:
        """Largest feature-set size seen so far (0 when empty)."""
        return self._max_feature_size



11 changes: 6 additions & 5 deletions tests/database/test_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,14 @@ def setUp(self):
self.db.add(string)

def test_strings(self):
self.assertEqual(self.db.strings, self.strings)
self.assertEqual(sorted(self.db.all()), sorted(self.strings))

# def test_min_feature_size(self):
# self.assertEqual(self.db.min_feature_size(), min(map(lambda x: len(x) + 1, self.strings)))

# def test_max_feature_size(self):
# self.assertEqual(self.db.max_feature_size(), max(map(lambda x: len(x) + 1, self.strings)))
def test_min_feature_size(self):
self.assertEqual(self.db.min_feature_size(), 2)

def test_max_feature_size(self):
self.assertEqual(self.db.max_feature_size(), 6)

def test_lookup_strings_by_feature_set_size_and_feature(self):
self.assertEqual(
Expand Down
Loading