trying to improve my knn algorithm
hunter.hammond.dev at gmail.com
hunter.hammond.dev at gmail.com
Wed Jul 1 14:21:58 EDT 2020
This is a k-nearest-neighbours (kNN) classifier for articles I have collected: given an article, it determines which category the article belongs to. I am not getting very good results :/
# Upper bound on how many nearest training articles take part in the vote.
k = 23

# Training articles are stored in one sub-folder per category.
training_folder = './data/training/'
minn_folder = f'{training_folder}Minnesota/'
health_folder = f'{training_folder}Health/'
def remove_punctuation(text):
    """Return *text* with every Unicode punctuation character removed.

    A character counts as punctuation when its Unicode general category
    starts with ``P`` (Pc, Pd, Ps, Pe, Pi, Pf, Po) -- the same class the
    ``\\p{P}`` pattern selects.  Symbols such as ``$`` or ``+`` are not
    punctuation and are kept.  Characters are deleted, not replaced by a
    space, so ``"well-known"`` becomes ``"wellknown"``.
    """
    # Local import keeps this function self-contained; the standard library
    # is enough here, so no third-party regex engine is required.
    import unicodedata

    return "".join(
        ch for ch in text if not unicodedata.category(ch).startswith("P")
    )
def file_list(folder):
    """Return the names of the regular files directly inside *folder*.

    Sub-directories (and anything else that is not a file) are left out.
    The names are bare entry names, not full paths, in the order
    ``listdir`` yields them.
    """
    regular_files = []
    for entry in listdir(folder):
        if isfile(join(folder, entry)):
            regular_files.append(entry)
    return regular_files
def all_file_list():
    """Return the paths of all training files, Minnesota first, then Health.

    Each path is the category folder prefix followed by the file name.
    """
    minn_paths = [minn_folder + name for name in file_list(minn_folder)]
    health_paths = [health_folder + name for name in file_list(health_folder)]
    return minn_paths + health_paths
def file_to_word_list(f):
    """Read the text file at path *f* and return its words as a list.

    Punctuation is stripped with ``remove_punctuation`` and the remaining
    text is split on whitespace.  Words keep their original case and
    duplicates are preserved, in document order.
    """
    # The context manager closes the handle as soon as the text is read,
    # even if reading raises; previously the file was never closed.
    with open(f, 'r') as fr:
        text_read = fr.read()
    return remove_punctuation(text_read).split()
def get_vocabularies(all_files):
    """Build the vocabulary of every word found in *all_files*.

    Returns a dict with one key per distinct word, each mapped to 0, so it
    can serve as a zero-initialised template for word-count vectors.
    """
    vocabulary = {}
    for path in all_files:
        # fromkeys collapses duplicates; update leaves existing keys in place.
        vocabulary.update(dict.fromkeys(file_to_word_list(path), 0))
    return vocabulary
def load_training_data():
    """Load every training article as a labelled word-count vector.

    Returns a list of dicts, one per training file, each of the form
    ``{'tag': <category folder name>, 'point': <word -> count dict>}``.
    Every ``point`` has a key for every word of the whole training
    vocabulary; words absent from that article keep the count 0.
    """
    all_files = all_file_list()
    voc = get_vocabularies(all_files)
    training_data = []
    for f in all_files:
        # The category is the folder that directly contains the file.
        # Indexing from the end keeps this correct if the training root
        # is moved to a path with a different number of components.
        tag = f.split('/')[-2]
        # The vocabulary is a flat dict of ints, so a shallow copy is a
        # fully independent vector (and far cheaper than a deep copy).
        point = voc.copy()
        for w in file_to_word_list(f):
            point[w] += 1
        training_data.append({'tag': tag, 'point': point})
    return training_data
def get_distance(p1, p2):
    """Return the Euclidean distance between two word-count vectors.

    *p1* and *p2* are dicts mapping a word to its count.  A word missing
    from one of the dicts is treated as having count 0 there, so every
    word that occurs in either vector contributes to the distance.
    (Skipping words that are absent from one side would make two
    documents with no words in common look identical, distance 0.)

    Returns a float; ``0.0`` when both vectors are equal or empty.
    """
    # Words of p1, compared against p2 (0 when p2 lacks the word).
    sq_sum = sum((count - p2.get(w, 0)) ** 2 for w, count in p1.items())
    # Words that occur only in p2: their counterpart in p1 is 0.
    sq_sum += sum(count ** 2 for w, count in p2.items() if w not in p1)
    return math.sqrt(sq_sum)
# Diagnostic helper: prints the training data's pairwise distances and tags.
def show_distances(training_data):
    """Print the distance between every pair of training points.

    For each pair ``i < j`` a ``d(i,j)=`` line, the distance and a blank
    line are printed; afterwards the tag of every training item is listed
    in order.  Nothing is returned.
    """
    count = len(training_data)
    for i, first in enumerate(training_data):
        for j in range(i + 1, count):
            second = training_data[j]
            print(f'd({i},{j})=')
            print(get_distance(first['point'], second['point']))
            print()
    for sample in training_data:
        print(sample['tag'])
def test(training_data, txt_file):
    """Classify the article in *txt_file* by a k-nearest-neighbour vote.

    training_data: list of ``{'tag': ..., 'point': ...}`` dicts, where
        ``point`` maps a word to its count in that training article.
    txt_file: path of the article to classify.

    The article is turned into a word-count dict, its distance to every
    training point is computed with ``get_distance``, and the ``k``
    closest points (all of them if there are fewer than ``k``) vote with
    their tags.  The tag with the most votes is returned; on a tie the
    tied tag whose closest member is nearest to the article wins.

    Raises ValueError if *training_data* is empty.
    """
    if not training_data:
        raise ValueError('training_data is empty; nothing to compare against')

    # Word counts of the article to classify.
    txt = {}
    for w in file_to_word_list(txt_file):
        txt[w] = txt.get(w, 0) + 1

    # Build a fresh record per training point.  Reusing one mutable record
    # and storing it by reference would let later iterations overwrite
    # neighbours that had already been selected.
    dist_list = [
        {'tag': pt['tag'], 'distance': get_distance(pt['point'], txt)}
        for pt in training_data
    ]
    # Sorting and slicing selects the k smallest distances reliably; no
    # hand-maintained "index of the current farthest neighbour" is needed.
    # The sort is stable, so equal distances keep training-data order.
    dist_list.sort(key=lambda neighbour: neighbour['distance'])
    nearest = dist_list[:k]

    # Tally votes.  Tags are inserted nearest-first, and max() returns the
    # first maximal key, which yields the tie-break described above.
    vote_result = {}
    for d in nearest:
        vote_result[d['tag']] = vote_result.get(d['tag'], 0) + 1
    return max(vote_result, key=vote_result.get)
def main(txt):
    """Load the training data, dump its distances and classify *txt*.

    txt: path of the article to classify.  The predicted category is
    printed as ``Category: <tag>``.
    """
    td = load_training_data()
    # show_distances prints its report itself and returns nothing, so it
    # is called directly; wrapping it in print() emitted a stray "None".
    show_distances(td)
    print('Category: ' + test(td, txt))
if __name__ == '__main__':
    # The article to classify is the first command-line argument.  Exit
    # with a usage message instead of an IndexError when it is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <article-file>')
    main(sys.argv[1])
More information about the Python-list
mailing list