import csv, re, string
from nltk.corpus import wordnet as wn
def normalize(textin):
    """Return *textin* converted to lower case.

    The interactive loop applies this to the user's query before
    tokenising it, so that query words compare equal to the lower-cased
    vocabulary keys.
    """
    return textin.lower()
# Function words ("connectors") that are ignored when matching.  Any of
# these that end up as keys of the vocabulary or of the synonym vocabulary
# are deleted again, so they never contribute to a question's score.
connectors = [
    'any', 'anyone', 'as', 'by', 'if', 'make', 'may',
    'onto', 'particular', 'possible', 'such', 'that', 'there', 'this',
    'under', 'up', 'we', 'with', 'would', 'your', 'about',
    'around', 'have', 'it', 'who', 'why', 'in', 'or',
    'out', 'where', 'does', 'on', 'and', 'are', 'can',
    'get', 'of', 'when', 'to', 'for', 'my', 'what',
    'is', 'the', 'a', 'how', 'do', 'i', 'offered',
]
# Load the FAQ knowledge base from FAQ_samples.csv into ``kb`` as a list of
# (column 0, column 4) tuples.  Column 0 is tokenised as the question text
# further down; column 4 is presumably the matching answer -- TODO confirm
# against the CSV layout.
kb = []
# The ``with`` block guarantees the file handle is closed once every row
# has been read (the reader is fully consumed inside it).
with open('FAQ_samples.csv') as faq_file:
    csvfile = csv.reader(faq_file, dialect='excel')
    for row in csvfile:
        # Skip rows that are too short or whose fifth column is empty.
        if len(row) >= 5 and row[4]:
            kb.append((row[0], row[4]))
# Build ``vocab``: an inverted index mapping each lower-cased token to the
# list of ``kb`` indices of the questions in which that token occurs.
vocab = {}
for index, (question, _answer) in enumerate(kb):
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", question)
    # De-duplicate per question so an index is recorded at most once per
    # token, however often the token is repeated in that question.
    for word in set(token.lower() for token in tokens):
        vocab.setdefault(word, []).append(index)

# Connector words carry no topical information; remove them from the index.
for word in connectors:
    vocab.pop(word, None)

# Snapshot of the remaining vocabulary words, used when building synonyms.
vocabwords = list(vocab)
# Build ``synvocab``: for every vocabulary word, look up its WordNet synsets
# and map each lemma name of those synsets to the ``kb`` indices of the
# questions that contain the original word.  Indices are accumulated in sets
# and converted to sorted lists at the end.
synvocab = {}
for word in vocabwords:
    question_ids = vocab[word]
    for synset in wn.synsets(word):
        # ``lemma_names`` is a plain attribute in old NLTK releases and a
        # method in NLTK 3+; support both.
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            synvocab.setdefault(lemma_name, set()).update(question_ids)

# Snapshot of every lemma name collected (taken before connectors are
# removed, as in the original script).
vocabwords = list(synvocab)

# Connector words must not contribute through the synonym index either.
for word in connectors:
    synvocab.pop(word, None)

# NOTE(review): lemma names are stored as WordNet returns them, while the
# user's query is lower-cased before lookup -- confirm whether lemma names
# should be lower-cased here as well.
for lemma_name, ids in list(synvocab.items()):
    synvocab[lemma_name] = sorted(ids)
# Interactive loop: read a query, score every knowledge-base question against
# it and print the best match.
#
# Scoring: each query word found in ``vocab`` adds 1/len(vocab[word]) to every
# question listed for that word; each query word found in ``synvocab`` adds
# syn_factor/len(synvocab[word]) likewise.  ``max_score`` accumulates the same
# weight once per matched word, i.e. the score a single question would reach
# if it were listed for every matched word.
#
# NOTE(review): the statement nesting (where ``max_score`` is incremented and
# which prints belong to the ``max_score < 0.5`` branch) was reconstructed
# from a copy without indentation -- please confirm against the intended
# behaviour.

try:
    _read_line = raw_input  # Python 2
except NameError:
    _read_line = input  # Python 3

_tokenize = re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*").findall
syn_factor = 0.4  # weight of a synonym match relative to a direct match

while True:
    try:
        textin = normalize(_read_line("->"))
    except (EOFError, KeyboardInterrupt):
        # End of input / Ctrl-C: leave the loop instead of dying with a
        # traceback.
        print("")
        break

    words = _tokenize(textin)
    scores = {}
    max_score = 0.0
    for word in words:
        # Direct matches first, then synonym matches, same as the original
        # accumulation order.
        for index_map, factor in ((vocab, 1.0), (synvocab, syn_factor)):
            if word not in index_map:
                continue
            question_ids = index_map[word]
            weight = factor / float(len(question_ids))
            for question_index in question_ids:
                scores[question_index] = scores.get(question_index, 0.0) + weight
            max_score += weight

    if not scores:
        # No query word matched anything, so there is no candidate question
        # to show (previously this raised IndexError).
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        continue

    # Highest score wins; ties go to the larger question index, exactly as
    # taking the last element of the sorted (score, index) pairs did.
    score, index = max((v, k) for k, v in scores.items())

    if max_score < 0.5:
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
    print("it seems like you might be interested in the answer to this question: ")
    print(kb[index][0])
    print(kb[index][1])