# This program analyzes a question/answer set to build a knowledge base and
# parsing routine for a conversational interface. In order to experiment with
# this, FOLLOW THESE INSTRUCTIONS:
#
# Create an excel spreadsheet CSV file with questions in column A and answers
# in column E. Name it FAQ_samples.csv.
#
# Install NLTK (the Natural Language Toolkit) and download its WordNet corpus.
#
# If you don't want to do all that, then just look at the results program,
# which incorporates MY Questions and Answers, and the NLTK synonyms into it,
# hard-coded. It's a lot easier to play with than this is.
#
# Results program is located at: http://www.pastie.org/574701
#
# Copyright 2009, Jerry Felix.
#
# Use Freely for non-commercial use,
# but please send me an email at amyiris@amyiris.com so I can
# keep you up-to-date on developments!
import csv, re, string
from nltk.corpus import wordnet as wn
# TO DO: Improve normalization, by eliminating apostrophes, punctuation, etc.
def normalize(textin):
    """Return ``textin`` converted to lower case.

    This is the only normalization applied so far; punctuation and
    apostrophes are left untouched.
    """
    lowered = textin.lower()
    return lowered
# These words are fairly meaningless to the scoring; feel free to expand or
# modify, so that it contains the meaningless words in your knowledge base
# Kept as a plain list, in the original order; written as whitespace-separated
# text so that adding or removing a word is a one-token change.
connectors = (
    "any anyone as by if make may "
    "onto particular possible such that "
    "there this under up we with "
    "would your about around have it "
    "who why in or out where does "
    "on and are can get of when to "
    "for my what is the a how do i "
    "offered"
).split()
# My csv file has questions in column A and answers in column E
# Load the knowledge base from the spreadsheet export. A row is kept only
# when it has at least five columns and a non-empty column E (the answer).
# The ``with`` block closes the file as soon as the rows have been read,
# instead of leaving the handle open for the life of the program.
with open('FAQ_samples.csv') as faq_file:
    csvfile = csv.reader(faq_file, dialect='excel')
    kb = [(row[0], row[4]) for row in csvfile if len(row) >= 5 and row[4]]
# kb is the Knowledge Base - a list of (questions, answers)
# Build the inverted index: every distinct token of a question maps to the
# list of question numbers (positions in kb) whose text contains it.
vocab = {}
for index, (question, _answer) in enumerate(kb):
    # Normalize and tokenize exactly the way the user's input is handled
    # (normalize() first, then the same pattern), so knowledge-base tokens and
    # query tokens stay comparable even if normalize() is improved later.
    tokens = re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", normalize(question))
    # set() makes sure a question is listed at most once per word.
    for word in set(tokens):
        vocab.setdefault(word, []).append(index)
# vocab is a dict whose key is a word, and value is a list of question numbers
# that the word appears in. It ended up being about 200 words for my 55
# questions.
# Below, I delete any of the connector words from vocab. Looping over the
# connectors (rather than over vocab) means the dict is never resized while
# it is being iterated, and pop() with a default silently skips connectors
# that never occurred in any question.
for word in connectors:
    vocab.pop(word, None)
# Now I'm going to create the synonyms based on NLTK's synsets
# Map every WordNet lemma related to a vocab word onto the questions that
# vocab word appears in. Question numbers are gathered in sets first so that
# merging is a cheap in-place update and duplicates collapse automatically.
question_sets = {}
for word, question_ids in vocab.items():
    for synset in wn.synsets(word):
        # Older NLTK releases expose lemma_names as an attribute, newer ones
        # as a method; accept either so the script runs against both.
        lemma_names = synset.lemma_names
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            question_sets.setdefault(lemma_name, set()).update(question_ids)
# Store plain lists of question numbers, the same shape vocab uses.
synvocab = dict((name, list(ids)) for name, ids in question_sets.items())
# synvocab ended up being a dict of about 1500 synonyms and related words.
# I delete the connector words out of the synonyms too.
# Looping over the connectors (not over synvocab) avoids resizing the dict
# while it is being iterated; pop() with a default ignores connectors that
# are not present among the synonyms.
for word in connectors:
    synvocab.pop(word, None)
# Here's my Conversational Interface
# Tokenizer for the user's input; compiled once instead of on every pass.
token_pattern = re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*")
syn_factor = 0.4  # How much credit do synonyms get, vs. exact matches?
try:
    read_line = raw_input  # Python 2
except NameError:
    read_line = input  # Python 3

while True:
    try:
        textin = normalize(read_line("->"))
    except EOFError:
        # End of input (Ctrl-D or a closed pipe): leave the loop quietly.
        break
    words = token_pattern.findall(textin)
    scores = {}  # a dictionary of scores indexed by question number
    max_score = 0.0  # for the user's input, what's the maximum score possible?
    for word in words:
        # Exact matches and synonym matches are credited independently, so a
        # word present in both tables earns both. The credit is divided by the
        # number of questions sharing the word, so rarer words weigh more.
        for table, credit in ((vocab, 1.0), (synvocab, syn_factor)):
            if word not in table:
                continue
            question_ids = table[word]
            weight = credit / float(len(question_ids))
            for question_index in question_ids:
                scores[question_index] = scores.get(question_index, 0.0) + weight
            # Added once per word: a question containing every word of the
            # input would reach exactly max_score.
            max_score += weight
    if not scores:
        # No word of the input is known at all. Picking the best entry of an
        # empty score table used to raise IndexError and end the program.
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        continue
    # Highest score wins; ties go to the larger question number, exactly as
    # sorting (score, index) pairs and taking the last one would.
    _best_score, index = max((v, k) for k, v in scores.items())
    # NOTE(review): the threshold is applied to max_score, not to the winning
    # question's own score -- confirm that this is intended.
    if max_score < 0.5:  # this is my threshold of understanding
        # NOTE(review): the file's indentation was lost; this assumes the
        # suggested question is shown only for low-confidence input while the
        # answer below is always shown -- confirm against the original layout.
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        print("it seems like you might be interested in the answer to this question: ")
        print(kb[index][0])
    print(kb[index][1])