3. WRITE A PROGRAM TO DEMONSTRATE THE WORKING OF THE DECISION TREE BASED ID3 ALGORITHM. USE AN APPROPRIATE DATA SET FOR BUILDING THE DECISION TREE AND APPLY THIS KNOWLEDGE TO CLASSIFY A NEW SAMPLE.
SOLUTION 1 (with packages) (given by Lokesh sir)
tennisdata.csv
lab3.py
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO
# Load the examples; every column except the last is a feature and the last
# column is the target.
data = pd.read_csv('tennisdata.csv')
print("The first 5 values of data is \n", data.head())

# Work on an explicit copy: assigning encoded columns to a slice of `data`
# triggers pandas' SettingWithCopyWarning and is not guaranteed to update X.
X = data.iloc[:, :-1].copy()
print("\nThe first 5 values of Train data is \n", X.head())
y = data.iloc[:, -1]
print("\nThe first 5 values of Train output is \n", y.head())

# One LabelEncoder per feature column, kept at module level so the same
# mapping can be applied to new samples later.
le_outlook = LabelEncoder()
X['Outlook'] = le_outlook.fit_transform(X['Outlook'])
le_Temperature = LabelEncoder()
X['Temperature'] = le_Temperature.fit_transform(X['Temperature'])
le_Humidity = LabelEncoder()
X['Humidity'] = le_Humidity.fit_transform(X['Humidity'])
le_Windy = LabelEncoder()
X['Windy'] = le_Windy.fit_transform(X['Windy'])
print("\nNow the Train data is", X.head())

# The target gets its own encoder so predictions can be decoded again.
le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
# This prints the encoded target, so the message says "output", not "data".
print("\nNow the Train output is\n", y)

# NOTE(review): DecisionTreeClassifier is used with its default settings;
# confirm whether an entropy-based criterion is wanted for an ID3-style tree.
classifier = DecisionTreeClassifier()
classifier.fit(X, y)
def labelEncoderForInput(list1):
    """Encode one raw sample with the fitted per-column label encoders.

    Args:
        list1: Sequence whose first four entries are the raw Outlook,
            Temperature, Humidity and Windy values, in that order.

    Returns:
        A list holding a single list of encoded values (one row).

    Raises:
        IndexError: If list1 has fewer than four entries.
    """
    encoders = (le_outlook, le_Temperature, le_Humidity, le_Windy)
    # Encode a copy so the caller's list keeps its raw values.
    encoded = list(list1)
    for position, encoder in enumerate(encoders):
        encoded[position] = encoder.transform([encoded[position]])[0]
    return [encoded]
# NOTE(review): Windy is supplied as the string "False"; if the CSV column was
# parsed as booleans the encoder was fitted on True/False instead - confirm.
inp = ["Rainy", "Mild", "High", "False"]
inp1 = ["Rainy", "Cool", "High", "False"]

# Hand over a copy so inp1 still holds the raw strings for the message below,
# whether or not the encoding helper modifies its argument.
pred1 = labelEncoderForInput(list(inp1))
y_pred = classifier.predict(pred1)

# inverse_transform expects an array of labels, not a scalar: decode the
# whole prediction array first, then pick the single result.
print("\nfor input {0}, we obtain {1}".format(inp1, le_PlayTennis.inverse_transform(y_pred)[0]))
STEPS & OUTPUT:
to view steps & output click HERE
SOLUTION 2 (without packages)
tennisdata.csv
lab3.py
import numpy as np
import math
import csv
def read_data(filename):
    """Read a comma-separated file into its header and its data rows.

    Args:
        filename: Path of the CSV file to open for reading.

    Returns:
        A tuple (metadata, traindata): metadata is the list of fields from
        the first line, traindata is the list of all remaining rows, each
        row being a list of strings.
    """
    with open(filename, 'r') as handle:
        reader = csv.reader(handle, delimiter=',')
        # The first line is the header; everything after it is data.
        metadata = list(next(reader))
        traindata = list(reader)
    return (metadata, traindata)
class Node:
    """One node of the decision tree.

    Attributes are set in the constructor: `attribute` is the value passed
    in, `children` starts as an empty list and `answer` starts as an empty
    string.
    """

    def __init__(self, attribute):
        """Create a node labelled with `attribute` and no children."""
        self.children, self.answer = [], ""
        self.attribute = attribute

    def __str__(self):
        """Return the node's attribute as its string form."""
        return self.attribute
def subtables(data, col, delete):
    """Partition the rows of `data` by the distinct values in column `col`.

    Args:
        data: 2-D NumPy array of examples.
        col: Index of the column to partition on.
        delete: If true, column `col` is removed from every sub-table.

    Returns:
        A tuple (items, tables): items is the array of unique values found in
        column `col` (as returned by np.unique), tables maps each of those
        values to the array of rows that carry it.
    """
    column = data[:, col]
    items = np.unique(column)
    tables = {}
    for value in items:
        # Boolean-mask selection keeps the input dtype and string width;
        # copying into a fixed-width byte buffer would turn str into bytes
        # and truncate long values.
        rows = data[column == value]
        if delete:
            rows = np.delete(rows, col, 1)
        tables[value] = rows
    return items, tables
def entropy(S):
    """Return the base-2 Shannon entropy of the values in `S`.

    Args:
        S: NumPy array of labels.

    Returns:
        The entropy as a plain float; 0.0 when `S` is empty or holds a
        single distinct value.
    """
    _, counts = np.unique(S, return_counts=True)
    if counts.size <= 1:
        # Nothing to distinguish: also avoids producing -0.0.
        return 0.0
    probabilities = counts / S.size
    # Always hand back a scalar float rather than a one-element array.
    return float(-np.sum(probabilities * np.log2(probabilities)))
def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting `data` on column `col`.

    The gain ratio is the information gain (entropy of the last column minus
    the size-weighted entropy of each partition) divided by the split
    information of the partition sizes.

    Args:
        data: 2-D NumPy array whose last column holds the class labels.
        col: Index of the attribute column to evaluate.

    Returns:
        The gain ratio as a float; 0.0 when the split information is zero
        (for example when the column has a single distinct value).
    """
    items, tables = subtables(data, col, delete=False)
    total_size = data.shape[0]

    weighted_entropy = 0.0
    split_info = 0.0
    for value in items:
        subset = tables[value]
        ratio = subset.shape[0] / total_size
        # entropy may hand back a scalar or a one-element array; np.sum
        # collapses either form to a single number.
        weighted_entropy += ratio * float(np.sum(entropy(subset[:, -1])))
        split_info -= ratio * math.log(ratio, 2)

    if split_info == 0:
        # A split that does not divide the data carries no information;
        # report zero instead of dividing by zero.
        return 0.0

    gain = float(np.sum(entropy(data[:, -1]))) - weighted_entropy
    return gain / split_info
def create_node(data, metadata):
    """Recursively build a decision tree for `data`.

    Args:
        data: 2-D NumPy array; the last column holds the class labels and
            every other column is an attribute.
        metadata: Sequence of column names, indexed like the columns of
            `data`.

    Returns:
        A Node. A leaf has `answer` set to a class label; an internal node
        has `attribute` set to the chosen column name and `children` filled
        with (value, child node) tuples.
    """
    classes, class_counts = np.unique(data[:, -1], return_counts=True)
    if classes.shape[0] == 1:
        # Every example agrees: this is a leaf.
        leaf = Node("")
        leaf.answer = classes[0]
        return leaf

    attribute_count = data.shape[1] - 1
    if attribute_count == 0:
        # No attributes left but the labels still disagree: answer with the
        # most frequent label instead of failing on an empty argmax.
        leaf = Node("")
        leaf.answer = classes[np.argmax(class_counts)]
        return leaf

    gains = np.zeros((attribute_count, 1))
    for col in range(attribute_count):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)

    node = Node(metadata[split])
    # The chosen column is dropped from the sub-tables, so drop its name too.
    remaining = np.delete(metadata, split, 0)
    items, tables = subtables(data, split, delete=True)
    for value in items:
        child = create_node(tables[value], remaining)
        node.children.append((value, child))
    return node
def empty(size):
    """Return a string made of `size` single spaces."""
    return " " * size
def print_tree(node, level):
    """Print the tree rooted at `node`, indented according to `level`.

    A node whose `answer` is not the empty string is printed as a leaf.
    Otherwise its attribute is printed, followed by each child value one
    level deeper and the child's own subtree two levels deeper.
    """
    is_leaf = node.answer != ""
    if is_leaf:
        print(empty(level), node.answer)
    else:
        print(empty(level), node.attribute)
        for branch_value, subtree in node.children:
            print(empty(level + 1), branch_value)
            print_tree(subtree, level + 2)
# Read the column names and the data rows from the CSV file.
metadata, traindata = read_data("tennisdata.csv")
# Turn the list of rows into a NumPy array (create_node slices it by column).
data = np.array(traindata)
# Build the tree from all examples and print it starting at level 0.
node = create_node(data, metadata)
print_tree(node, 0)
STEPS & OUTPUT:
to view steps & output click HERE
SOLUTION 1 (with packages) (given by Lokesh sir)
tennisdata.csv
Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,FALSE,No
Sunny,Hot,High,TRUE,No
Overcast,Hot,High,FALSE,Yes
Rainy,Mild,High,FALSE,Yes
Rainy,Cool,Normal,FALSE,Yes
Rainy,Cool,Normal,TRUE,No
Overcast,Cool,Normal,TRUE,Yes
Sunny,Mild,High,FALSE,No
Sunny,Cool,Normal,FALSE,Yes
Rainy,Mild,Normal,FALSE,Yes
Sunny,Mild,Normal,TRUE,Yes
Overcast,Mild,High,TRUE,Yes
Overcast,Hot,Normal,FALSE,Yes
Rainy,Mild,High,TRUE,No
lab3.py
import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO
# Load the examples; every column except the last is a feature and the last
# column is the target.
data = pd.read_csv('tennisdata.csv')
print("The first 5 values of data is \n", data.head())

# Work on an explicit copy: assigning encoded columns to a slice of `data`
# triggers pandas' SettingWithCopyWarning and is not guaranteed to update X.
X = data.iloc[:, :-1].copy()
print("\nThe first 5 values of Train data is \n", X.head())
y = data.iloc[:, -1]
print("\nThe first 5 values of Train output is \n", y.head())

# One LabelEncoder per feature column, kept at module level so the same
# mapping can be applied to new samples later.
le_outlook = LabelEncoder()
X['Outlook'] = le_outlook.fit_transform(X['Outlook'])
le_Temperature = LabelEncoder()
X['Temperature'] = le_Temperature.fit_transform(X['Temperature'])
le_Humidity = LabelEncoder()
X['Humidity'] = le_Humidity.fit_transform(X['Humidity'])
le_Windy = LabelEncoder()
X['Windy'] = le_Windy.fit_transform(X['Windy'])
print("\nNow the Train data is", X.head())

# The target gets its own encoder so predictions can be decoded again.
le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
# This prints the encoded target, so the message says "output", not "data".
print("\nNow the Train output is\n", y)

# NOTE(review): DecisionTreeClassifier is used with its default settings;
# confirm whether an entropy-based criterion is wanted for an ID3-style tree.
classifier = DecisionTreeClassifier()
classifier.fit(X, y)
def labelEncoderForInput(list1):
    """Encode one raw sample with the fitted per-column label encoders.

    Args:
        list1: Sequence whose first four entries are the raw Outlook,
            Temperature, Humidity and Windy values, in that order.

    Returns:
        A list holding a single list of encoded values (one row).

    Raises:
        IndexError: If list1 has fewer than four entries.
    """
    encoders = (le_outlook, le_Temperature, le_Humidity, le_Windy)
    # Encode a copy so the caller's list keeps its raw values.
    encoded = list(list1)
    for position, encoder in enumerate(encoders):
        encoded[position] = encoder.transform([encoded[position]])[0]
    return [encoded]
# NOTE(review): Windy is supplied as the string "False"; if the CSV column was
# parsed as booleans the encoder was fitted on True/False instead - confirm.
inp = ["Rainy", "Mild", "High", "False"]
inp1 = ["Rainy", "Cool", "High", "False"]

# Hand over a copy so inp1 still holds the raw strings for the message below,
# whether or not the encoding helper modifies its argument.
pred1 = labelEncoderForInput(list(inp1))
y_pred = classifier.predict(pred1)

# inverse_transform expects an array of labels, not a scalar: decode the
# whole prediction array first, then pick the single result.
print("\nfor input {0}, we obtain {1}".format(inp1, le_PlayTennis.inverse_transform(y_pred)[0]))
STEPS & OUTPUT:
to view steps & output click HERE
SOLUTION 2 (without packages)
tennisdata.csv
Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,FALSE,No
Sunny,Hot,High,TRUE,No
Overcast,Hot,High,FALSE,Yes
Rainy,Mild,High,FALSE,Yes
Rainy,Cool,Normal,FALSE,Yes
Rainy,Cool,Normal,TRUE,No
Overcast,Cool,Normal,TRUE,Yes
Sunny,Mild,High,FALSE,No
Sunny,Cool,Normal,FALSE,Yes
Rainy,Mild,Normal,FALSE,Yes
Sunny,Mild,Normal,TRUE,Yes
Overcast,Mild,High,TRUE,Yes
Overcast,Hot,Normal,FALSE,Yes
Rainy,Mild,High,TRUE,No
lab3.py
import numpy as np
import math
import csv
def read_data(filename):
    """Read a comma-separated file into its header and its data rows.

    Args:
        filename: Path of the CSV file to open for reading.

    Returns:
        A tuple (metadata, traindata): metadata is the list of fields from
        the first line, traindata is the list of all remaining rows, each
        row being a list of strings.
    """
    with open(filename, 'r') as handle:
        reader = csv.reader(handle, delimiter=',')
        # The first line is the header; everything after it is data.
        metadata = list(next(reader))
        traindata = list(reader)
    return (metadata, traindata)
class Node:
    """One node of the decision tree.

    Attributes are set in the constructor: `attribute` is the value passed
    in, `children` starts as an empty list and `answer` starts as an empty
    string.
    """

    def __init__(self, attribute):
        """Create a node labelled with `attribute` and no children."""
        self.children, self.answer = [], ""
        self.attribute = attribute

    def __str__(self):
        """Return the node's attribute as its string form."""
        return self.attribute
def subtables(data, col, delete):
    """Partition the rows of `data` by the distinct values in column `col`.

    Args:
        data: 2-D NumPy array of examples.
        col: Index of the column to partition on.
        delete: If true, column `col` is removed from every sub-table.

    Returns:
        A tuple (items, tables): items is the array of unique values found in
        column `col` (as returned by np.unique), tables maps each of those
        values to the array of rows that carry it.
    """
    column = data[:, col]
    items = np.unique(column)
    tables = {}
    for value in items:
        # Boolean-mask selection keeps the input dtype and string width;
        # copying into a fixed-width byte buffer would turn str into bytes
        # and truncate long values.
        rows = data[column == value]
        if delete:
            rows = np.delete(rows, col, 1)
        tables[value] = rows
    return items, tables
def entropy(S):
    """Return the base-2 Shannon entropy of the values in `S`.

    Args:
        S: NumPy array of labels.

    Returns:
        The entropy as a plain float; 0.0 when `S` is empty or holds a
        single distinct value.
    """
    _, counts = np.unique(S, return_counts=True)
    if counts.size <= 1:
        # Nothing to distinguish: also avoids producing -0.0.
        return 0.0
    probabilities = counts / S.size
    # Always hand back a scalar float rather than a one-element array.
    return float(-np.sum(probabilities * np.log2(probabilities)))
def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting `data` on column `col`.

    The gain ratio is the information gain (entropy of the last column minus
    the size-weighted entropy of each partition) divided by the split
    information of the partition sizes.

    Args:
        data: 2-D NumPy array whose last column holds the class labels.
        col: Index of the attribute column to evaluate.

    Returns:
        The gain ratio as a float; 0.0 when the split information is zero
        (for example when the column has a single distinct value).
    """
    items, tables = subtables(data, col, delete=False)
    total_size = data.shape[0]

    weighted_entropy = 0.0
    split_info = 0.0
    for value in items:
        subset = tables[value]
        ratio = subset.shape[0] / total_size
        # entropy may hand back a scalar or a one-element array; np.sum
        # collapses either form to a single number.
        weighted_entropy += ratio * float(np.sum(entropy(subset[:, -1])))
        split_info -= ratio * math.log(ratio, 2)

    if split_info == 0:
        # A split that does not divide the data carries no information;
        # report zero instead of dividing by zero.
        return 0.0

    gain = float(np.sum(entropy(data[:, -1]))) - weighted_entropy
    return gain / split_info
def create_node(data, metadata):
    """Recursively build a decision tree for `data`.

    Args:
        data: 2-D NumPy array; the last column holds the class labels and
            every other column is an attribute.
        metadata: Sequence of column names, indexed like the columns of
            `data`.

    Returns:
        A Node. A leaf has `answer` set to a class label; an internal node
        has `attribute` set to the chosen column name and `children` filled
        with (value, child node) tuples.
    """
    classes, class_counts = np.unique(data[:, -1], return_counts=True)
    if classes.shape[0] == 1:
        # Every example agrees: this is a leaf.
        leaf = Node("")
        leaf.answer = classes[0]
        return leaf

    attribute_count = data.shape[1] - 1
    if attribute_count == 0:
        # No attributes left but the labels still disagree: answer with the
        # most frequent label instead of failing on an empty argmax.
        leaf = Node("")
        leaf.answer = classes[np.argmax(class_counts)]
        return leaf

    gains = np.zeros((attribute_count, 1))
    for col in range(attribute_count):
        gains[col] = gain_ratio(data, col)
    split = np.argmax(gains)

    node = Node(metadata[split])
    # The chosen column is dropped from the sub-tables, so drop its name too.
    remaining = np.delete(metadata, split, 0)
    items, tables = subtables(data, split, delete=True)
    for value in items:
        child = create_node(tables[value], remaining)
        node.children.append((value, child))
    return node
def empty(size):
    """Return a string made of `size` single spaces."""
    return " " * size
def print_tree(node, level):
    """Print the tree rooted at `node`, indented according to `level`.

    A node whose `answer` is not the empty string is printed as a leaf.
    Otherwise its attribute is printed, followed by each child value one
    level deeper and the child's own subtree two levels deeper.
    """
    is_leaf = node.answer != ""
    if is_leaf:
        print(empty(level), node.answer)
    else:
        print(empty(level), node.attribute)
        for branch_value, subtree in node.children:
            print(empty(level + 1), branch_value)
            print_tree(subtree, level + 2)
# Read the column names and the data rows from the CSV file.
metadata, traindata = read_data("tennisdata.csv")
# Turn the list of rows into a NumPy array (create_node slices it by column).
data = np.array(traindata)
# Build the tree from all examples and print it starting at level 0.
node = create_node(data, metadata)
print_tree(node, 0)
STEPS & OUTPUT:
to view steps & output click HERE
If you have the remaining ML programs, please upload them.
ReplyDelete