# Demo: Text Vectorization
class Course:
    """A course whose title text is reduced to term-frequency (TF) counts.

    Attributes:
        id: Identifier supplied by the caller when the course is created.
        tfs: Dictionary mapping each lower-cased word to the number of
            times it has been counted so far.
    """

    def __init__(self, course_id):
        """Create a course with the given ID and no counted words.

        Args:
            course_id: Value stored unchanged as ``self.id``.
        """
        self.id = course_id
        # term frequency (TF) counts, keyed by lower-cased word
        self.tfs = {}

    def count_words(self, text):
        """Add the words of ``text`` to this course's TF counts.

        The text is lower-cased and split on runs of whitespace. Counts
        accumulate in ``self.tfs`` across calls; nothing is returned.

        Args:
            text: Title (or other string) to tokenize and count.
        """
        # split() with no argument collapses repeated whitespace and yields
        # nothing for empty text; split(" ") would emit "" tokens in those
        # cases, and the empty string would then be counted as a word.
        for word in text.lower().split():
            # start from 0 when the word has not been counted yet
            self.tfs[word] = self.tfs.get(word, 0) + 1
def print_dictionary(dict_data):
    """Print every entry of a dictionary, one ``key value`` pair per line.

    Args:
        dict_data: Dictionary whose entries are written to standard output
            in iteration order.
    """
    for entry_key, entry_value in dict_data.items():
        print(entry_key, entry_value)
# Course titles used as the demo corpus; each title is treated as one document.
titles = [
    "Information and System",
    "Data and Information",
    "System and System Programming",
]

# Build one Course per title, with IDs assigned sequentially starting at 1.
courses = []
for course_id, title in enumerate(titles, start=1):
    course = Course(course_id)
    # process the title and compute its term frequencies (TF)
    course.count_words(title)
    # add the course to the list
    courses.append(course)
# Document frequency (DF): for each word, the number of courses whose
# TF dictionary contains it.
dfs = {}
for course in courses:
    # A word is a key of course.tfs at most once, so every course
    # contributes at most 1 to that word's DF, however often it occurs.
    for word in course.tfs:
        dfs[word] = dfs.setdefault(word, 0) + 1
# Report the corpus-wide document frequencies first.
print("DF statistics: ", "==============", sep="\n")
print_dictionary(dfs)

# Then report the term frequencies of each course under its own heading.
print("TF statistics: ", "==============", sep="\n")
for course in courses:
    print("COURSE #{0}".format(course.id), "-------------", sep="\n")
    print_dictionary(course.tfs)
    # blank line between consecutive courses
    print("")