Wednesday, July 3, 2019

ML 3 - ID3 ALGORITHM

3. WRITE A PROGRAM TO DEMONSTRATE THE WORKING OF THE DECISION TREE BASED ID3 ALGORITHM. USE AN APPROPRIATE DATA SET FOR BUILDING THE DECISION TREE AND APPLY THIS KNOWLEDGE TO CLASSIFY A NEW SAMPLE.

SOLUTION 1 (with packages) (given by Lokesh sir)

tennisdata.csv


Outlook
Temperature
Humidity
Windy
PlayTennis
Sunny
Hot
High
FALSE
No
Sunny
Hot
High
TRUE
No
Overcast
Hot
High
FALSE
Yes
Rainy
Mild
High
FALSE
Yes
Rainy
Cool
Normal
FALSE
Yes
Rainy
Cool
Normal
TRUE
No
Overcast
Cool
Normal
TRUE
Yes
Sunny
Mild
High
FALSE
No
Sunny
Cool
Normal
FALSE
Yes
Rainy
Mild
Normal
FALSE
Yes
Sunny
Mild
Normal
TRUE
Yes
Overcast
Mild
High
TRUE
Yes
Overcast
Hot
Normal
FALSE
Yes
Rainy
Mild
High
TRUE
No

lab3.py

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): the original `from sklearn.externals.six import StringIO`
# was unused and that module has been removed from scikit-learn (>= 0.23),
# so it is dropped here; `from sklearn import tree` was unused as well.

# Load the play-tennis dataset: four categorical features, target PlayTennis.
data = pd.read_csv('tennisdata.csv')
print("The first 5 values of data is \n", data.head())

# Features = every column but the last; target = the last column.
X = data.iloc[:, :-1]
print("\nThe first 5 values of Train data is \n", X.head())
y = data.iloc[:, -1]
print("\nThe first 5 values of Train output is \n", y.head())

# DecisionTreeClassifier needs numeric input, so label-encode each column.
# One encoder per column so the same mapping can encode new samples later.
le_outlook = LabelEncoder()
X.Outlook = le_outlook.fit_transform(X.Outlook)
le_Temperature = LabelEncoder()
X.Temperature = le_Temperature.fit_transform(X.Temperature)
le_Humidity = LabelEncoder()
X.Humidity = le_Humidity.fit_transform(X.Humidity)
# pandas may parse the TRUE/FALSE column as booleans; normalize to the
# title-case strings "True"/"False" so that text samples such as "False"
# encode consistently (the original fit on raw values and then failed to
# transform the string "False").
le_Windy = LabelEncoder()
X.Windy = le_Windy.fit_transform(X.Windy.astype(str).str.title())

print("\nNow the Train data is", X.head())

le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
print("\nNow the Train data is\n", y)

classifier = DecisionTreeClassifier()
classifier.fit(X, y)


def labelEncoderForInput(list1):
    """Encode one raw sample [Outlook, Temperature, Humidity, Windy] with
    the fitted encoders and return it wrapped in an outer list — the 2-D
    shape `predict` expects. Unlike the original, the input list is not
    mutated, so it can still be printed as the human-readable sample."""
    return [[
        le_outlook.transform([list1[0]])[0],
        le_Temperature.transform([list1[1]])[0],
        le_Humidity.transform([list1[2]])[0],
        le_Windy.transform([str(list1[3]).title()])[0],
    ]]


inp1 = ["Rainy", "Cool", "High", "False"]
pred1 = labelEncoderForInput(inp1)
y_pred = classifier.predict(pred1)
# inverse_transform takes an array-like, not a scalar: decode the whole
# prediction array, then take its first element (the original passed
# y_pred[0], which raises in current scikit-learn).
print("\nfor input {0}, we obtain {1}".format(inp1, le_PlayTennis.inverse_transform(y_pred)[0]))

STEPS & OUTPUT:

to view steps & output click HERE

SOLUTION 2 (without packages)

tennisdata.csv


Outlook
Temperature
Humidity
Windy
PlayTennis
Sunny
Hot
High
FALSE
No
Sunny
Hot
High
TRUE
No
Overcast
Hot
High
FALSE
Yes
Rainy
Mild
High
FALSE
Yes
Rainy
Cool
Normal
FALSE
Yes
Rainy
Cool
Normal
TRUE
No
Overcast
Cool
Normal
TRUE
Yes
Sunny
Mild
High
FALSE
No
Sunny
Cool
Normal
FALSE
Yes
Rainy
Mild
Normal
FALSE
Yes
Sunny
Mild
Normal
TRUE
Yes
Overcast
Mild
High
TRUE
Yes
Overcast
Hot
Normal
FALSE
Yes
Rainy
Mild
High
TRUE
No

lab3.py

import numpy as np
import math
import csv

def read_data(filename):
    """Read a comma-separated file and return ``(metadata, traindata)``.

    ``metadata`` is the header row as a list of column names and
    ``traindata`` is the remaining rows, each a list of strings.
    """
    with open(filename, 'r') as csvfile:
        rows = csv.reader(csvfile, delimiter=',')
        metadata = list(next(rows))
        traindata = [row for row in rows]
    return (metadata, traindata)

class Node:
    """A decision-tree node.

    Either an internal node that splits on ``attribute`` (with one
    ``(value, child)`` pair per attribute value in ``children``), or a
    leaf, in which case ``attribute`` is ``""`` and ``answer`` holds the
    class label.
    """
    def __init__(self, attribute):
        self.attribute = attribute  # attribute this node splits on ("" for a leaf)
        self.children = []          # list of (attribute_value, child Node) pairs
        self.answer = ""            # class label at a leaf; "" for internal nodes

    def __str__(self):
        return self.attribute

def subtables(data, col, delete):
    """Partition ``data`` (a 2-D array of strings) by the values in column ``col``.

    Returns ``(items, tables)``: the sorted unique values of the column and
    a dict mapping each value to the sub-array of matching rows. When
    ``delete`` is true, the split column is removed from each sub-array.

    Fixes vs. the original: rows are selected with a boolean mask, which
    keeps the array's native unicode dtype — the original copied rows into
    an ``"|S32"`` byte-string buffer, which truncates long cells and makes
    leaf labels print as ``b'Yes'``. It also shadowed the builtin ``dict``
    and counted matches with a quadratic double loop.
    """
    items = np.unique(data[:, col])
    tables = {}
    for item in items:
        subset = data[data[:, col] == item]
        if delete:
            subset = np.delete(subset, col, 1)
        tables[item] = subset
    return items, tables

def entropy(S):
    """Return the Shannon entropy (in bits) of the label array ``S``.

    Fix vs. the original: the result is a plain Python number. The old
    version accumulated into a 1-element numpy array and called
    ``math.log`` on it, which is deprecated for size-1 arrays in
    NumPy >= 1.25.
    """
    values, counts = np.unique(S, return_counts=True)
    # A pure set of labels carries no information.
    if values.size == 1:
        return 0
    probs = counts / S.size
    return float(-(probs * np.log2(probs)).sum())

def gain_ratio(data, col):
    """Return Quinlan's gain ratio for splitting ``data`` on column ``col``.

    gain_ratio = information_gain / intrinsic_value, where the class label
    is the last column of ``data``.

    Fixes vs. the original: when the column holds a single value the
    intrinsic value is 0 and the original divided by zero (nan/inf with a
    runtime warning) — we define the gain ratio as 0 in that degenerate
    case, so ``argmax`` in the caller never prefers a constant column. The
    original also shadowed the builtin ``dict``.
    """
    items, tables = subtables(data, col, delete=False)
    total_size = data.shape[0]

    # Start from the entropy of the full label column and subtract the
    # weighted entropy of each subset (information gain), while also
    # accumulating the split's intrinsic value.
    info_gain = entropy(data[:, -1])
    intrinsic_value = 0.0
    for item in items:
        ratio = tables[item].shape[0] / total_size
        info_gain -= ratio * entropy(tables[item][:, -1])
        intrinsic_value -= ratio * math.log(ratio, 2)

    if intrinsic_value == 0:
        return 0
    return info_gain / intrinsic_value

def create_node(data, metadata):
    """Recursively build the ID3 decision tree for ``data``.

    ``metadata`` holds the attribute names aligned with the columns of
    ``data``; the last column is the class label. Returns the root Node.
    """
    labels = np.unique(data[:, -1])
    # Pure subset: emit a leaf carrying the single class label.
    if labels.shape[0] == 1:
        leaf = Node("")
        leaf.answer = labels[0]
        return leaf

    # Score every attribute column and split on the best gain ratio.
    n_attrs = data.shape[1] - 1
    gains = np.zeros((n_attrs, 1))
    for col in range(n_attrs):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)

    node = Node(metadata[split])
    remaining = np.delete(metadata, split, 0)

    # One branch per value of the chosen attribute; the split column is
    # dropped from each subset so it is not reused deeper in the tree.
    items, tables = subtables(data, split, delete=True)
    for item in items:
        node.children.append((item, create_node(tables[item], remaining)))

    return node

def empty(size):
    """Return an indentation string of ``size`` three-space units."""
    return "   " * size

def print_tree(node, level):
    """Pretty-print the tree: leaves show their answer, internal nodes
    show the split attribute, and each branch value is indented one extra
    step with its subtree two steps deeper."""
    indent = "   " * level  # three spaces per level, as empty() produced
    if node.answer != "":
        print(indent, node.answer)
        return
    print(indent, node.attribute)
    for value, subtree in node.children:
        print("   " * (level + 1), value)
        print_tree(subtree, level + 2)

# Driver: load the dataset, build the ID3 tree, and display it.
metadata, traindata = read_data("tennisdata.csv")
data = np.array(traindata)  # 2-D array of strings; the last column is the class label
node = create_node(data, metadata)
print_tree(node, 0)


STEPS & OUTPUT:

to view steps & output click HERE

1 comment: