""" This module holds functions that are responsible for creating a new decision tree and for using the tree for data classificiation. """ def majority_value(data, target_attr): """ Creates a list of all values in the target attribute for each record in the data list object, and returns the value that appears in this list the most frequently. """ data = data[:] return most_frequent([record[target_attr] for record in data]) def most_frequent(lst): """ Returns the item that appears most frequently in the given list. """ lst = lst[:] highest_freq = 0 most_freq = None for val in unique(lst): if lst.count(val) > highest_freq: most_freq = val highest_freq = lst.count(val) return most_freq def unique(lst): """ Returns a list made up of the unique values found in lst. i.e., it removes the redundant values in lst. """ lst = lst[:] unique_lst = [] # Cycle through the list and add each value to the unique list only once. for item in lst: if unique_lst.count(item) <= 0: unique_lst.append(item) # Return the list with all redundant values removed. return unique_lst def get_values(data, attr): """ Creates a list of values in the chosen attribut for each record in data, prunes out all of the redundant values, and return the list. """ data = data[:] return unique([record[attr] for record in data]) def choose_attribute(data, attributes, target_attr, fitness): """ Cycles through all the attributes and returns the attribute with the highest information gain (or lowest entropy). """ data = data[:] best_gain = 0.0 best_attr = None for attr in attributes: gain = fitness(data, attr, target_attr) if (gain >= best_gain and attr != target_attr): best_gain = gain best_attr = attr return best_attr def get_examples(data, attr, value): """ Returns a list of all the records in with the value of matching the given value. """ data = data[:] rtn_lst = [] if not data: return rtn_lst else: record = data.pop() if record[attr] == value: rtn_lst.append(record) rtn_lst.extend(get_examples(data, attr, value)) return rtn_lst else: rtn_lst.extend(get_examples(data, attr, value)) return rtn_lst def get_classification(record, tree): """ This function recursively traverses the decision tree and returns a classification for the given record. """ # If the current node is a string, then we've reached a leaf node and # we can return it as our answer if type(tree) == type("string"): return tree # Traverse the tree further until a leaf node is found. else: attr = tree.keys()[0] print attr, tree[attr], record t = tree[attr][record[attr]] return get_classification(record, t) def classify(tree, data, default=None): """ Returns a list of classifications for each of the records in the data list as determined by the given decision tree. """ data = data[:] classification = [] for record in data: try: classification.append(get_classification(record, tree)) except: classification.append(default) return classification def create_decision_tree(data, attributes, target_attr, fitness_func): """ Returns a new decision tree based on the examples given. """ data = data[:] vals = [record[target_attr] for record in data] default = majority_value(data, target_attr) # If the dataset is empty or the attributes list is empty, return the # default value. When checking the attributes list for emptiness, we # need to subtract 1 to account for the target attribute. if not data or (len(attributes) - 1) <= 0: return default # If all the records in the dataset have the same classification, # return that classification. elif vals.count(vals[0]) == len(vals): return vals[0] else: # Choose the next best attribute to best classify our data best = choose_attribute(data, attributes, target_attr, fitness_func) # Create a new decision tree/node with the best attribute and an empty # dictionary object--we'll fill that up next. tree = {best:{}} # Create a new decision tree/sub-node for each of the values in the # best attribute field for val in get_values(data, best): # Create a subtree for the current value under the "best" field subtree = create_decision_tree( get_examples(data, best, val), [attr for attr in attributes if attr != best], target_attr, fitness_func) # Add the new subtree to the empty dictionary object in our new # tree/node we just created. tree[best][val] = subtree return tree