diff --git a/.gitignore b/.gitignore index cf56c1a..e42741d 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,6 @@ dev/data/geo_address.csv dev/geo_matching.py *.so .coverage -simstring/site/ \ No newline at end of file +simstring/site/ +tmp*/ +addresses.csv \ No newline at end of file diff --git a/dev/company_names.py b/dev/company_names.py index 06a47b7..5f8691c 100644 --- a/dev/company_names.py +++ b/dev/company_names.py @@ -1,56 +1,61 @@ # coding: utf-8 - -import os, sys - -import numpy as np - from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor -from simstring.measure.cosine import ( - CosineMeasure, -) # , OverlapMeasure, LeftOverlapMeasure +from simstring.measure.cosine import CosineMeasure +from simstring.measure.overlap import OverlapMeasure, LeftOverlapMeasure -# from simstring.database.mongo import MongoDatabase from simstring.database.dict import DictDatabase +from simstring.database.disk import DiskDatabase +from simstring.database.redis import RedisDatabase from simstring.searcher import Searcher +from tqdm import tqdm from pyinstrument import Profiler profiler = Profiler() -def output_similar_strings_of_each_line(path, measure): +def output_similar_strings_of_each_line(path, measures, db_cls): strings = [] with open(path, "r") as lines: for line in lines: - strings.append(line.rstrip("\r\n")) - - db = DictDatabase(CharacterNgramFeatureExtractor(2)) - for string in strings: + strings.append(line.rstrip("\r\n").strip().lower()) + + db = make_db(db_cls, strings) + + for measure in measures: + searcher = Searcher(db, measure) + profiler.start() + + for string in strings: + result = searcher.search(string, 0.8) + + profiler.stop() + print(result) + print(db_cls.__name__, measure.__class__.__name__) + profiler.print() + +def make_db(db_cls, strings): + db = db_cls(CharacterNgramFeatureExtractor(2)) + i = 0 + for string in tqdm(strings): db.add(string) - - # db.save("companies.db") - - # dbl = DictDatabase.load("companies.db") - - searcher = Searcher(db, measure) - profiler.start() - - for string in strings: - result = searcher.search(string, 0.8) - # result = [str(np.round(x[0], 5)) + ' ' + x[1] for x in searcher.ranked_search(string, 0.8)] - # print("\t".join([string, ",".join(result)])) - - profiler.stop() - print(result) - profiler.print() - # profiler.open_in_browser() - - -measure = CosineMeasure() -output_similar_strings_of_each_line("dev/data/company_names.txt", measure) - -# measure = OverlapMeasure() -# output_similar_strings_of_each_line("dev/data/company_names.txt", measure) - -# measure = LeftOverlapMeasure() -# output_similar_strings_of_each_line("./data/company_names.txt", measure) + i += 1 + if (i % 10000) == 0: + db.commit() + i = 0 + db.commit() + return db + +if __name__ =="__main__": + file = "dev/data/company_names.txt" + # file = "dev/data/unabridged_dictionary.txt" + # file = "dev/data/addresses.csv" + # measures = [CosineMeasure(), OverlapMeasure(), LeftOverlapMeasure()] + measures = [CosineMeasure()] + dbs = [DictDatabase,DiskDatabase, RedisDatabase] + # dbs = [DiskDatabase] + for db_cls in dbs: + output_similar_strings_of_each_line(file, measures, db_cls) + + # for db_cls in [DictDatabase,DiskDatabase]: + # output_similar_strings_of_each_line("dev/data/unabridged_dictionary2.txt", measures, db_cls) diff --git a/pyproject.toml b/pyproject.toml index 2df8d80..32b2df3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -21,7 +21,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: Implementation :: CPython", ] -dependencies = [] +dependencies = ["diskcache==5.6.1", "fakeredis", "redis"] dynamic = ["version"] [project.urls] @@ -45,13 +45,18 @@ mypy-args = [ "--check-untyped-defs", "--install-types" ] +exclude = [ + "simstring/database/disk.py", + "simstring/database/redis.py", + "simstring/database/base.py", +] [tool.hatch.envs.test] dependencies = [ "pytest", "pytest-cov", "build", - "cython" + "cython==3.0.0" ] [tool.hatch.envs.default.scripts] @@ -61,7 +66,7 @@ no-cov = "cov --no-cov {args}" build = "python -m build" [[tool.hatch.envs.test.matrix]] -python = [ "38", "39", "310", "311"] +python = ["38","39", "310", "311"] [tool.coverage.run] branch = true @@ -94,7 +99,7 @@ serve = "cd simstring && mkdocs serve --dev-addr localhost:8000" [tool.hatch.envs.benchmark] dependencies = [ - "pyinstrument", "benchmarker" , "numpy" + "pyinstrument", "benchmarker" , "numpy", "tqdm" ] [[tool.hatch.envs.benchmark.matrix]] python = [ "38", "39", "310", "311"] diff --git a/simstring/database/base.py b/simstring/database/base.py index 7210fe8..f8c16b1 100644 --- a/simstring/database/base.py +++ b/simstring/database/base.py @@ -13,3 +13,6 @@ def max_feature_size(self): def lookup_strings_by_feature_set_size_and_feature(self, size, feature): raise NotImplementedError + + def commit(self): + pass \ No newline at end of file diff --git a/simstring/database/disk.py b/simstring/database/disk.py new file mode 100644 index 0000000..f993348 --- /dev/null +++ b/simstring/database/disk.py @@ -0,0 +1,102 @@ + +from typing import List, Set, Dict, Union +from .base import BaseDatabase +from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor +from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor + +from io import BufferedWriter +import diskcache as dc +from functools import lru_cache + +import os + +FeatureExtractor = Union[ + CharacterNgramFeatureExtractor, WordNgramFeatureExtractor + ] + +class DiskDatabase(BaseDatabase): + def __init__( + self, + feature_extractor: FeatureExtractor, + path:str= 'tmp' + ): + self.feature_extractor = feature_extractor + self.feature_set_size_to_string_map: dc.Cache = dc.Cache(os.path.join(path,'feature_set_size_to_string_map')) + self.feature_set_size_and_feature_to_string_map: dc.Cache = dc.Cache(os.path.join(path,'feature_set_size_and_feature_to_string_map')) + self._min_feature_size = 9999999 + self._max_feature_size = 0 + self.path = path + + @staticmethod + def _make_key(size: int, feature: str) -> str: + return f"{size}-{feature}" + + def add_feature_set_size_and_feature_to_string_map(self, size, feature, string)-> None: + key = self._make_key(size,feature) + if key in self.feature_set_size_and_feature_to_string_map: + d = self.feature_set_size_and_feature_to_string_map[key] + if string in d: + return + else: + d = set() + d.add(string) + self.feature_set_size_and_feature_to_string_map[key] = d + + def get_feature_set_size_and_feature_to_string_map(self, size: int, feature: str + ) -> Set[str]: + try: + return self.feature_set_size_and_feature_to_string_map[self._make_key(size,feature)] + except KeyError: + return set() + + def commit(self): + pass + + def add(self, string: str) -> None: + features, size = self._process_string(string) + + for feature in features: + self.add_feature_set_size_and_feature_to_string_map(size, feature, string) + + def fast_add(self, string: str) -> None: + features, size = self._process_string(string) + + for feature in features: + self.add_feature_set_size_and_feature_to_string_map(size, feature, string) + + def _process_string(self, string:str): + features = self.feature_extractor.features(string) + size = len(features) + + if size not in self.feature_set_size_to_string_map: + size_to_string_map = set() + else: + size_to_string_map = self.feature_set_size_to_string_map[size] + + size_to_string_map.add(string) + self.feature_set_size_to_string_map[size] = size_to_string_map + + + self._min_feature_size = min(self._min_feature_size, size) + self._max_feature_size = max(self._max_feature_size, size) + return features,size + + def all(self) -> List[str]: + strings = [] + for k in self.feature_set_size_to_string_map.iterkeys(): + strings.extend(self.feature_set_size_to_string_map[k]) + return strings + + def lookup_strings_by_feature_set_size_and_feature( + self, size: int, feature: str + ) -> Set[str]: + return self.get_feature_set_size_and_feature_to_string_map(size,feature) + + def min_feature_size(self) -> int: + return self._min_feature_size + + def max_feature_size(self) -> int: + return self._max_feature_size + + + diff --git a/simstring/database/redis.py b/simstring/database/redis.py new file mode 100644 index 0000000..416f2ba --- /dev/null +++ b/simstring/database/redis.py @@ -0,0 +1,65 @@ + +from typing import List, Set, Dict, Union +from .base import BaseDatabase +from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor +from simstring.feature_extractor.word_ngram import WordNgramFeatureExtractor + +from io import BufferedWriter +from redis import Redis +from fakeredis import FakeRedis +from functools import lru_cache + +import os + +FeatureExtractor = Union[ + CharacterNgramFeatureExtractor, WordNgramFeatureExtractor + ] + +class RedisDatabase(BaseDatabase): + def __init__( + self, + feature_extractor: FeatureExtractor, + redis_connection: Union[Redis,FakeRedis] = FakeRedis + + ): + self.feature_extractor = feature_extractor + self.feature_set_size_to_string_map = redis_connection(db=0, decode_responses=True) + self.feature_set_size_and_feature_to_string_map = redis_connection(db=1, decode_responses=True) + self._min_feature_size = 9999999 + self._max_feature_size = 0 + + + @staticmethod + def _make_key(size: int, feature: str) -> str: + return f"{size}-{feature}" + + def add(self, string: str) -> None: + features = self.feature_extractor.features(string) + size = len(features) + self.feature_set_size_to_string_map.sadd(size, string) + + self._min_feature_size = min(self._min_feature_size, size) + self._max_feature_size = max(self._max_feature_size, size) + + for feature in features: + self.feature_set_size_and_feature_to_string_map.sadd(self._make_key(size, feature), string) + + def all(self) -> List[str]: + strings = [] + for k in self.feature_set_size_to_string_map.keys(): + strings.extend(self.feature_set_size_to_string_map.smembers(k)) + return strings + + def lookup_strings_by_feature_set_size_and_feature( + self, size: int, feature: str + ) -> Set[str]: + return self.feature_set_size_and_feature_to_string_map.smembers(self._make_key(size, feature)) + + def min_feature_size(self) -> int: + return self._min_feature_size + + def max_feature_size(self) -> int: + return self._max_feature_size + + + diff --git a/tests/database/test_dict.py b/tests/database/test_dict.py index 7c85234..4c94bb5 100644 --- a/tests/database/test_dict.py +++ b/tests/database/test_dict.py @@ -16,13 +16,14 @@ def setUp(self): self.db.add(string) def test_strings(self): - self.assertEqual(self.db.strings, self.strings) + self.assertEqual(sorted(self.db.all()), sorted(self.strings)) - # def test_min_feature_size(self): - # self.assertEqual(self.db.min_feature_size(), min(map(lambda x: len(x) + 1, self.strings))) - # def test_max_feature_size(self): - # self.assertEqual(self.db.max_feature_size(), max(map(lambda x: len(x) + 1, self.strings))) + def test_min_feature_size(self): + self.assertEqual(self.db.min_feature_size(), 2) + + def test_max_feature_size(self): + self.assertEqual(self.db.max_feature_size(), 6) def test_lookup_strings_by_feature_set_size_and_feature(self): self.assertEqual( diff --git a/tests/database/test_disk.py b/tests/database/test_disk.py new file mode 100644 index 0000000..c163093 --- /dev/null +++ b/tests/database/test_disk.py @@ -0,0 +1,69 @@ +# -*- coding:utf-8 -*- + +from unittest import TestCase +from simstring.database.disk import DiskDatabase +from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor +import pickle +import os +import shutil + + +class TestDisk(TestCase): + strings = ["a", "ab", "abc", "abcd", "abcde"] + + def setUp(self): + self.db = DiskDatabase(CharacterNgramFeatureExtractor(2), path="tmp_db_for_tests") + for string in self.strings: + self.db.add(string) + + + def tearDown(self) -> None: + shutil.rmtree(self.db.path) + return super().tearDown() + + def test_strings(self): + self.assertEqual(sorted(self.db.all()), sorted(self.strings)) + + def test_min_feature_size(self): + self.assertEqual(self.db.min_feature_size(), 2) + + def test_max_feature_size(self): + self.assertEqual(self.db.max_feature_size(), 6) + + def test_lookup_strings_by_feature_set_size_and_feature(self): + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(4, "ab_1"), + set(["abc"]), + ) + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(3, "ab_1"), + set(["ab"]), + ) + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(2, "ab_1"), set([]) + ) + + def test_load_from_folder(self): + + with open("test.pkl", "wb") as f: + pickle.dump(self.db, f) + + + with open("test.pkl", "rb") as f: + new = pickle.load(f) + + self.assertEqual(self.db._min_feature_size, new._min_feature_size) + self.assertEqual(self.db._max_feature_size, new._max_feature_size) + self.assertEqual( + self.db.feature_extractor.__class__, new.feature_extractor.__class__ + ) + self.assertEqual(self.db.feature_extractor.n, new.feature_extractor.n) + self.assertEqual( + set(self.db.feature_set_size_to_string_map.iterkeys()), set(new.feature_set_size_to_string_map.iterkeys()) + ) + self.assertEqual( + set(self.db.feature_set_size_and_feature_to_string_map.iterkeys()), + set(new.feature_set_size_and_feature_to_string_map.iterkeys()), + ) + + os.remove("test.pkl") diff --git a/tests/database/test_redis.py b/tests/database/test_redis.py new file mode 100644 index 0000000..7efeeca --- /dev/null +++ b/tests/database/test_redis.py @@ -0,0 +1,63 @@ +# -*- coding:utf-8 -*- + +from unittest import TestCase +from simstring.database.redis import RedisDatabase +from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor +import pickle +import os +import shutil +from fakeredis import FakeRedis + +class TestRedis(TestCase): + strings = ["a", "ab", "abc", "abcd", "abcde"] + + def setUp(self): + self.db = RedisDatabase(CharacterNgramFeatureExtractor(2), redis_connection=FakeRedis) + for string in self.strings: + self.db.add(string) + + def test_strings(self): + self.assertEqual(sorted(self.db.all()), sorted(self.strings)) + + def test_min_feature_size(self): + self.assertEqual(self.db.min_feature_size(), 2) + + def test_max_feature_size(self): + self.assertEqual(self.db.max_feature_size(), 6) + + def test_lookup_strings_by_feature_set_size_and_feature(self): + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(4, "ab_1"), + set(["abc"]), + ) + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(3, "ab_1"), + set(["ab"]), + ) + self.assertEqual( + self.db.lookup_strings_by_feature_set_size_and_feature(2, "ab_1"), set([]) + ) + + def test_load_from_folder(self): + with open("test.pkl", "wb") as f: + pickle.dump(self.db, f) + + + with open("test.pkl", "rb") as f: + new = pickle.load(f) + + self.assertEqual(self.db._min_feature_size, new._min_feature_size) + self.assertEqual(self.db._max_feature_size, new._max_feature_size) + self.assertEqual( + self.db.feature_extractor.__class__, new.feature_extractor.__class__ + ) + self.assertEqual(self.db.feature_extractor.n, new.feature_extractor.n) + self.assertEqual( + set(self.db.feature_set_size_to_string_map.iterkeys()), set(new.feature_set_size_to_string_map.iterkeys()) + ) + self.assertEqual( + set(self.db.feature_set_size_and_feature_to_string_map.iterkeys()), + set(new.feature_set_size_and_feature_to_string_map.iterkeys()), + ) + + os.remove("test.pkl")