In: Computer Science
Solve using PYTHON PROGRAMMING
9. Write a script that reads the file “ai_trends.txt” into a list of words, removes from that list the words found in the file “stopwords_en.txt”, and then:
a. Calculates the average occurrence of the words. Occurrence is the number of times a word appears in the text.
b. Calculates the longest word
c. Calculates the average word length. This is based on the unique words: each word counts as one
d. Creates a bar chart of the 10 most frequent words.
Solve using PYTHON PROGRAMMING
import re
import operator
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
# Load the stop words into the list `stopwords`.
# Each line of the file is split on commas, so both "one word per line"
# and "comma-separated" layouts are accepted.
stopwords = []
filename = "stopwords_en.txt"
with open(filename, 'r') as file:
    content = file.readlines()

for aLine in content:
    for word in aLine.split(","):
        # Strip BEFORE the duplicate check: the stored entries are stripped,
        # so testing the raw token (with its spaces/newline) would never
        # detect a duplicate.
        word = word.strip()
        # Skip empty entries produced by blank lines or trailing commas.
        if word and word not in stopwords:
            stopwords.append(word)
# Read the text file into the list `textwords`, one cleaned word per entry.
textwords = []
filename = "ai_trends.txt"
with open(filename, 'r') as file:
    content = file.readlines()

for aLine in content:
    # split() without an argument splits on any run of whitespace, so the
    # last word of a line no longer carries its trailing newline (the regex
    # below keeps whitespace and therefore would not remove it).
    for word in aLine.split():
        # Remove punctuation from the word, then lower-case it so that
        # "The" and "the" are counted as the same word.
        res = re.sub(r'[^\w\s]', '', word).lower()
        # Tokens made only of punctuation (e.g. "-") become empty: drop them.
        if res:
            textwords.append(res)
# Remove stop words from textwords.
# The lookup set is built once so each membership test is O(1) instead of
# scanning the whole stop-word list for every word of the text.
stopword_set = set()
for stopword in stopwords:
    lowered = stopword.lower()
    stopword_set.add(lowered)
    # The text words have had their punctuation stripped ("don't" -> "dont"),
    # so the stop words must be normalised the same way or they never match.
    stopword_set.add(re.sub(r'[^\w\s]', '', lowered))

# Compare case-insensitively so a capitalised "The" is still recognised
# as the stop word "the".
textwords = [word for word in textwords if word.lower() not in stopword_set]
# Build `freq`: a mapping from each non-empty word to the number of times
# it occurs in textwords (keys appear in first-occurrence order).
freq = {}
for token in textwords:
    if token:
        freq[token] = freq.get(token, 0) + 1
# Average occurrence: total number of counted words divided by the number
# of unique words.
count = len(freq)
sum_ = sum(freq.values())
# Guard against an empty dictionary (empty text, or every word was a stop
# word); dividing unconditionally would raise ZeroDivisionError.
average_occurrence = sum_ / count if count else 0.0
print("Average occurence: ", str(average_occurrence))
# Longest word among the unique words. On ties the first one encountered
# wins; with no words at all the result is the empty string with length 0.
max_word = max(freq, key=len, default="")
maximum = len(max_word)
print(max_word, maximum)
# Average length of the unique words left after stop-word removal:
# every distinct word counts exactly once, however often it occurs.
count = len(freq)
sum_ = sum(len(key) for key in freq)
# Guard against an empty dictionary; dividing unconditionally would raise
# ZeroDivisionError when no words remain.
average_length = sum_ / count if count else 0.0
print("Average word length of unique words:", str(average_length))
# Order the (word, count) pairs from most to least frequent; the sort is
# stable, so words with equal counts keep their first-occurrence order.
by_frequency = sorted(freq.items(), key=lambda item: item[1], reverse=True)
newDict = dict(by_frequency)
print(newDict)
# Bar chart of the ten most frequent words (fewer if the text has fewer
# unique words). newDict is already ordered by descending frequency.
top_entries = list(newDict.items())[:10]
word = [key for key, _ in top_entries]
frequency = [value for _, value in top_entries]

# Set figure size
figure(num=None, figsize=(15,7), dpi=80)
plt.bar(word, frequency, color="orange")
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.show()