From 29d7b476547513500d815e6e378c1f635e30557c Mon Sep 17 00:00:00 2001
From: zharkovstas
Date: Sun, 19 May 2019 04:26:10 +0500
Subject: [PATCH] add stemmer for streets

Skip the street-name tip when the stemmed street name is found in the
stemmed summary. The name is wrapped in re.escape() because it is data,
not a regular expression.

stemmer.py wraps the Snowball Russian stemmer. Its sample call runs only
when the module is executed directly, so importing it from app.py prints
nothing.
---
NOTE(review): the leading whitespace and the blank context lines of the
app.py hunks were reconstructed, as was the separator character inside the
tip f-string (assumed U+2028); verify them against the target tree before
applying.

 app.py     |  4 +++-
 stemmer.py | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 stemmer.py

diff --git a/app.py b/app.py
index 5b1d710..2ac3993 100644
--- a/app.py
+++ b/app.py
@@ -9,9 +9,11 @@ from string_formats import choose_numeral_form
 
 from osm.osm import describe_objects
 from yandex import get_text_by_coordinates
+from stemmer import stemming
 
 import itertools
 import bottle
+import re
 
 cities = {
     "Екатеринбург": (56.807556, 56.847826, 60.570744, 60.657791),
@@ -72,7 +74,7 @@ def add_tips(game):
 
         success, summary = parse_summary(
             s['name'].replace('улица', '').replace('проспект', '').replace('переулок', '').strip())
-        if success and 'улица' not in summary:
+        if success and 'улица' not in summary and not re.search(re.escape(stemming(s['name'])), stemming(summary), re.IGNORECASE):
             game.tips.append(f'{summary.capitalize()}.  Это как-то связано с названием ближайшей улицы 🤔')
 
     buildings = near_objects['buildings']
diff --git a/stemmer.py b/stemmer.py
new file mode 100644
--- /dev/null
+++ b/stemmer.py
@@ -0,0 +1,48 @@
+"""Russian stemming helpers built on the Snowball stemmer."""
+import re
+import sys
+
+import snowballstemmer
+
+stemmer = snowballstemmer.stemmer('russian')
+
+# Characters outside this whitelist (lowercase Latin/Cyrillic letters, digits,
+# the listed punctuation and whitespace) are replaced with a space.
+bad_characters_regex = re.compile(r'[^a-zа-я0-9-+,.;"\'()/!?\s]')
+# Punctuation that gets padded with spaces so each mark becomes its own token.
+punctuation_regex = re.compile(r'[-+,.;"\'()/!?]')
+# Runs of spaces, collapsed to a single space.
+many_spaces_regex = re.compile(r' +')
+
+
+def normalize(line):
+    """Return *line* lowercased and cleaned up for word-by-word stemming.
+
+    'ё' is folded into 'е', punctuation marks are surrounded by spaces,
+    characters outside the whitelist become spaces, and runs of spaces are
+    collapsed; leading and trailing whitespace is stripped.
+    """
+    result = line.lower()
+    result = result.replace('ё', 'е')
+    # Raw string: \g<0> is a regex group reference, not a string escape.
+    result = punctuation_regex.sub(r' \g<0> ', result)
+    result = bad_characters_regex.sub(' ', result)
+    result = many_spaces_regex.sub(' ', result)
+
+    return result.strip()
+
+
+def stemming(word):
+    """Return the stem of a single word (the input is not normalized)."""
+    return stemmer.stemWord(word)
+
+
+def text_stemming(text):
+    """Normalize *text* and return its stemmed words joined by single spaces."""
+    normalized_text = normalize(text)
+    return ' '.join(stemmer.stemWords(normalized_text.split()))
+
+
+if __name__ == '__main__':
+    # Manual smoke check; guarded so that importing the module prints nothing.
+    print(text_stemming('Народной воли'))
-- 
2.50.1