"""Russian text normalisation and stemming helpers (added for street names).

Wraps the ``snowballstemmer`` Russian stemmer and provides a small
normaliser that prepares free text for word-by-word stemming.
"""
import re
import sys

import snowballstemmer

# Shared Snowball stemmer instance for Russian, reused by every call below.
stemmer = snowballstemmer.stemmer('russian')

# Matches every character that is NOT a lowercase Latin/Cyrillic letter,
# a digit, one of the listed punctuation marks, or whitespace.
bad_characters_regex = re.compile(r'[^a-zа-я0-9-+,.;"\'()/!?\s]')
# Punctuation marks that are split off into separate tokens.
punctuation_regex = re.compile(r'[-+,.;"\'()/!?]')
# Runs of spaces, collapsed to a single space at the end of normalisation.
many_spaces_regex = re.compile(r' +')


def normalize(line: str) -> str:
    """Return *line* lowercased, cleaned and with punctuation spaced out.

    Steps, in order: lowercase; replace 'ё' with 'е'; surround each
    punctuation mark with spaces so it becomes its own token; replace any
    character outside the allowed set with a space; collapse runs of
    spaces; strip leading and trailing whitespace.

    For example, ``'Привет, Мир!'`` becomes ``'привет , мир !'``.
    """
    result = line.lower().replace('ё', 'е')
    # Raw string: ``\g<0>`` is a regex back-reference to the whole match,
    # not a Python string escape.
    result = punctuation_regex.sub(r' \g<0> ', result)
    result = bad_characters_regex.sub(' ', result)
    result = many_spaces_regex.sub(' ', result)
    return result.strip()


def stemming(word: str) -> str:
    """Return the stem of *word* as produced by the Russian stemmer.

    The argument is handed to ``stemWord`` unchanged: it is neither
    normalised nor split into words here. Use ``text_stemming`` for
    multi-word text.
    """
    return stemmer.stemWord(word)


def text_stemming(text: str) -> str:
    """Normalise *text*, stem each whitespace-separated token, and re-join.

    Returns the stemmed tokens joined by single spaces; an empty or
    all-whitespace input yields an empty string.
    """
    normalized_text = normalize(text)
    return ' '.join(stemmer.stemWords(normalized_text.split()))


if __name__ == '__main__':
    # Manual smoke check. Guarded so that importing this module (e.g.
    # ``from stemmer import stemming``) does not print anything.
    print(text_stemming('Народной воли'))