[Tracker-discuss] [issue515] Full text search doesn't return results
shiyao.ma
metatracker at psf.upfronthosting.co.za
Fri Apr 3 23:57:41 CEST 2015
shiyao.ma added the comment:
This patch splits text in the following way.
Suppose TEXT = "aa bb ee.ff.gg".
The resulting words will be: aa, bb, ee, ff, gg, ee.ff, ff.gg, ee.ff.gg
In other words, the new words are runs of adjacent components joined by a dot.
When searching through the web interface, for example, if the text is "kk hh.pp", then the split words are "kk" and "hh.pp".
In other words, when searching, we treat dot-separated words as a single term.
The handling for the CSV interface and the Xapian-based indexer have not been modified. If the above approach is okay, I will do the remaining work.
----------
nosy: +ezio.melotti
_______________________________________________________
PSF Meta Tracker <metatracker at psf.upfronthosting.co.za>
<http://psf.upfronthosting.co.za/roundup/meta/issue515>
_______________________________________________________
-------------- next part --------------
diff --git a/roundup/backends/indexer_common.py b/roundup/backends/indexer_common.py
--- a/roundup/backends/indexer_common.py
+++ b/roundup/backends/indexer_common.py
@@ -23,6 +23,28 @@
# gibberish (encoded text or somesuch) or shorter than 2 characters
self.minlength = 2
self.maxlength = 25
+ self.dot_maxlength = 10
+ self.dot_maxrepeat = 2
+ self.pattern_word = re.compile(
+ r'\b\w{%d,%d}\b'
+ % (self.minlength, self.maxlength),
+ re.UNICODE)
+ self.pattern_dot = re.compile(
+ r'\b(\w{1,%d}\.){1,%d}\w{1,%d}\b' %
+ (self.dot_maxlength, self.dot_maxrepeat, self.dot_maxlength),
+ re.UNICODE)
+
+ def segment_text(self, text):
+ wordlist = [w for w in re.findall(self.pattern_word, text)]
+ match = re.search(self.pattern_dot, text)
+ while match:
+ words = match.group().split('.')
+ for length in range(2, len(words)+1):
+ for idx in range(len(words)-length+1):
+ wordlist.append('.'.join(words[idx:idx+length]))
+ text = text[match.end():]
+ match = re.search(self.pattern_dot, text)
+ return set(w.encode('utf8') for w in wordlist)
def is_stopword(self, word):
return word in self.stopwords
diff --git a/roundup/backends/indexer_rdbms.py b/roundup/backends/indexer_rdbms.py
--- a/roundup/backends/indexer_rdbms.py
+++ b/roundup/backends/indexer_rdbms.py
@@ -64,9 +64,7 @@
if not isinstance(text, unicode):
text = unicode(text, "utf-8", "replace")
text = text.upper()
- wordlist = [w.encode("utf-8")
- for w in re.findall(r'(?u)\b\w{%d,%d}\b'
- % (self.minlength, self.maxlength), text)]
+ wordlist = self.segment_text(text)
words = set()
for word in wordlist:
if self.is_stopword(word): continue
diff --git a/roundup/cgi/templating.py b/roundup/cgi/templating.py
--- a/roundup/cgi/templating.py
+++ b/roundup/cgi/templating.py
@@ -2800,7 +2800,7 @@
if self.search_text:
matches = self.client.db.indexer.search(
[w.upper().encode("utf-8", "replace") for w in re.findall(
- r'(?u)\b\w{2,25}\b',
+ r'(?u)\b(?:\w|\.){2,40}\b',
unicode(self.search_text, "utf-8", "replace")
)], klass, ignore)
else:
More information about the Tracker-discuss
mailing list