Pastie now auto-senses if line-wrap is a bad or good idea. Feedback?
## mark a section (Learn more)
# This program analyzes a question/answer set to build a knowledge base and
# parsing routine for a conversational interface.  In order to experiment with
# this, FOLLOW THESE INSTRUCTIONS:
#
# Create an excel spreadsheet CSV file with questions in column A and answers
# in column E.  Name it FAQ_samples.csv.
#
# Download the NLTK Natural Language Tool Kit.
#
# If you don't want to do all that, then just look at the results program,
# which incorporates MY Questions and Answers, and the NLTK synonyms into it,
# hard-coded.  It's a lot easier to play with than this is.
#
# Results program is located at:  http://www.pastie.org/574701
#
# Copyright 2009, Jerry Felix.
#
# Use Freely for non-commercial use,
# but please send me an email at amyiris@amyiris.com so I can
# keep you up-to-date on developments!

import csv
import re
import string

from nltk.corpus import wordnet as wn

# Python 2 spells the line-reading builtin ``raw_input``; Python 3 calls it
# ``input``.  Bind whichever one exists so the prompt loop works on both.
try:
    read_line = raw_input
except NameError:
    read_line = input


# TO DO: Improve normalization, by eliminating apostrophes, punctuation, etc.
def normalize(textin):
    """Return *textin* lower-cased, the canonical form used for matching."""
    return textin.lower()


# One tokenizer for both the stored questions and the user's input, so the two
# sides are always split into words the same way.
TOKEN_RE = re.compile(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*")


def tokenize(text):
    """Return the list of normalized tokens found in *text*."""
    return TOKEN_RE.findall(normalize(text))


# These words are fairly meaningless to the scoring; feel free to expand or
# modify, so that it contains the meaningless words in your knowledge base
connectors = [
    'any', 'anyone', 'as', 'by', 'if', 'make', 'may', 'onto', 'particular',
    'possible', 'such', 'that', 'there', 'this', 'under', 'up', 'we', 'with',
    'would', 'your', 'about', 'around', 'have', 'it', 'who', 'why', 'in',
    'or', 'out', 'where', 'does', 'on', 'and', 'are', 'can', 'get', 'of',
    'when', 'to', 'for', 'my', 'what', 'is', 'the', 'a', 'how', 'do', 'i',
    'offered',
]
# Set copy of the list above, for constant-time membership tests.
_connector_set = frozenset(connectors)

# My csv file has questions in column A and answers in column E.
# kb is the Knowledge Base - a list of (question, answer) tuples.  Rows that
# have fewer than five columns or an empty answer are skipped.
kb = []
with open('FAQ_samples.csv') as csv_handle:
    for row in csv.reader(csv_handle, dialect='excel'):
        if len(row) >= 5 and row[4]:
            kb.append((row[0], row[4]))

# vocab is a dict whose key is a word, and value is a list of question numbers
# that the word appears in.  It ended up being about 200 words for my 55
# questions.  Connector words are filtered out as the dict is built, so no
# separate deletion pass is needed.
vocab = {}
for index, (question, _) in enumerate(kb):
    # set(): a word repeated inside one question is recorded only once.
    for word in set(tokenize(question)):
        if word not in _connector_set:
            vocab.setdefault(word, []).append(index)

# Now create the synonyms based on NLTK's synsets.  Every lemma name of every
# synset of a vocab word inherits that word's question numbers.  Connector
# words are kept out of the synonyms too.
_synonym_sets = {}
for word, question_indexes in vocab.items():
    for synset in wn.synsets(word):
        lemma_names = synset.lemma_names
        # Older NLTK releases expose lemma_names as an attribute, NLTK 3 as a
        # method; accept either.
        if callable(lemma_names):
            lemma_names = lemma_names()
        for lemma_name in lemma_names:
            if lemma_name not in _connector_set:
                _synonym_sets.setdefault(lemma_name, set()).update(
                    question_indexes)
# synvocab maps a synonym to the list of question numbers it points at.  It
# ended up being about 1500 synonyms and related words.
synvocab = dict(
    (name, sorted(indexes)) for name, indexes in _synonym_sets.items())

syn_factor = 0.4  # How much credit do synonyms get, vs. exact matches?

# Here's my Conversational Interface
while True:
    try:
        textin = normalize(read_line("->"))
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the loop instead of dying with a
        # traceback.
        break
    words = TOKEN_RE.findall(textin)

    scores = {}      # a dictionary of scores indexed by question number
    max_score = 0.0  # for the user's input, what's the maximum score possible?
    for word in words:
        # A word is worth less the more questions it appears in.
        # NOTE(review): the original paste lost its indentation.  max_score is
        # accumulated once per matched word here (the most any single question
        # could have earned from it), which matches the comment on max_score;
        # confirm against the author's original nesting.
        if word in vocab:
            weight = 1.0 / len(vocab[word])
            for question_index in vocab[word]:
                scores[question_index] = (
                    scores.get(question_index, 0.0) + weight)
            max_score += weight
        if word in synvocab:
            weight = syn_factor / len(synvocab[word])
            for question_index in synvocab[word]:
                scores[question_index] = (
                    scores.get(question_index, 0.0) + weight)
            max_score += weight

    if not scores:
        # Nothing in the input matched any known word or synonym, so there is
        # no candidate question to offer.  (Previously this case crashed with
        # IndexError while picking the best score from an empty list.)
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        continue

    # Highest score wins; ties go to the higher question number, exactly as
    # taking the last element of the sorted (score, index) pairs did.
    _, index = max((value, key) for key, value in scores.items())

    # NOTE(review): which of the prints below belong to the `if` is a
    # reconstruction of the flattened paste - confirm with the original.
    if max_score < 0.5:  # this is my threshold of understanding
        print("Sorry, I didn't understand. ")
        print("Try asking your question a different way. ")
        print("it seems like you might be interested in the answer to this question: ")
    print(kb[index][0])
    print(kb[index][1])
This paste will be private.
From the Design Piracy series on my blog: