Wednesday, July 3, 2019

ML 1 - FIND-S ALGORITHM

1. IMPLEMENT AND DEMONSTRATE THE FIND-S ALGORITHM FOR FINDING THE MOST SPECIFIC HYPOTHESIS BASED ON A GIVEN SET OF TRAINING DATA SAMPLES. READ THE TRAINING DATA FROM A .CSV FILE.


SOLUTION  1 - To display only the final output

trainingdata.csv


Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

prog1.py


import csv

def find_s(rows):
    """Return the maximally specific hypothesis covering every positive row.

    Each row is a list of attribute values followed by a class label in the
    last position.  Only rows labelled exactly "Yes" are used.  The number of
    attributes is taken from the rows themselves rather than being fixed.

    Args:
        rows: iterable of lists of strings, e.g. the rows from csv.reader.

    Returns:
        A list with one entry per attribute: the shared value where all
        positive rows agree, '?' where they differ.  If there is no positive
        row, a list of '0' (the "nothing accepted yet" marker) is returned.
    """
    # csv.reader yields [] for blank lines; indexing row[-1] on those would fail.
    rows = [row for row in rows if row]

    # None (not '0') marks "no positive example seen yet", so a genuine
    # attribute value of '0' in the data is not mistaken for the marker.
    hypothesis = None
    for row in rows:
        if row[-1] != "Yes":
            continue
        attributes = row[:-1]
        if hypothesis is None:
            hypothesis = list(attributes)
        else:
            hypothesis = [
                old if old == new else '?'
                for old, new in zip(hypothesis, attributes)
            ]

    if hypothesis is None:
        width = len(rows[0]) - 1 if rows else 0
        return ['0'] * width
    return hypothesis


if __name__ == "__main__":
    # newline="" is what the csv module expects for files it parses.
    with open("trainingdata.csv", newline="") as f:
        data = list(csv.reader(f))

    h = find_s(data)
    print(h)

Output

['Sunny', 'Warm', '?', 'Strong', '?', '?']

OR

SOLUTION  1 - To display steps & final output

import csv

def find_s_trace(rows):
    """Run Find-S and record every generalisation step.

    Each row is a list of attribute values followed by a class label in the
    last position.  Only rows labelled exactly "Yes" take part.

    Args:
        rows: iterable of lists of strings, e.g. the rows from csv.reader.

    Returns:
        A tuple (positives, steps, hypothesis):
          positives  - the rows whose label is "Yes", in input order;
          steps      - a snapshot of the hypothesis taken every time an
                       attribute of a positive row disagrees with the stored
                       value (this includes attributes that are already '?',
                       so consecutive snapshots may be identical);
          hypothesis - the final hypothesis, or a list of '0' when there is
                       no positive row.
    """
    # csv.reader yields [] for blank lines; skip them before reading labels.
    rows = [row for row in rows if row]
    positives = [row for row in rows if row[-1] == "Yes"]

    hypothesis = None  # None = no positive example absorbed yet
    steps = []
    for row in positives:
        attributes = row[:-1]
        if hypothesis is None:
            # The first positive example becomes the initial hypothesis.
            hypothesis = list(attributes)
            continue
        for j, value in enumerate(attributes[:len(hypothesis)]):
            if hypothesis[j] != value:
                hypothesis[j] = '?'
                steps.append(list(hypothesis))

    if hypothesis is None:
        width = len(rows[0]) - 1 if rows else 0
        hypothesis = ['0'] * width
    return positives, steps, hypothesis


if __name__ == "__main__":
    with open("trainingdata.csv", newline="") as f:
        data = list(csv.reader(f))

    positives, steps, h = find_s_trace(data)

    print("The +ve examples are:")
    for example in positives:
        print(example)

    print("\nThe steps of Find-S Algo are:")
    for step in steps:
        print(step)

    print("\nFinal specific hypothesis:\n", h)

Output

The +ve examples are:
['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes']
['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes']
['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes']

The steps of Find-S Algo are:
['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
['Sunny', 'Warm', '?', 'Strong', '?', 'Same']
['Sunny', 'Warm', '?', 'Strong', '?', '?']

Final specific hypothesis:
 ['Sunny', 'Warm', '?', 'Strong', '?', '?']

SOLUTION  2 

trainingdata.csv 


Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

lab1.py

import csv
def find_s_steps(positives):
    """Return the Find-S hypothesis after each positive example.

    The hypothesis starts as the attribute values of the first example (every
    column except the last) and an attribute becomes '?' as soon as an example
    disagrees with it.

    Args:
        positives: list of rows (attribute values followed by the label), all
            of them positive examples.

    Returns:
        One hypothesis snapshot per example, in order; [] when `positives`
        is empty.  The input rows are not modified.
    """
    if not positives:
        return []

    width = len(positives[0]) - 1  # the last column is the class label
    hypothesis = list(positives[0][:width])
    snapshots = []
    for example in positives:
        for k in range(width):
            if hypothesis[k] != example[k]:
                hypothesis[k] = '?'
        snapshots.append(list(hypothesis))
    return snapshots


if __name__ == "__main__":
    # Placeholder shown before any example has been processed.
    hypo = ['%', '%', '%', '%', '%', '%']

    with open('trainingdata.csv', newline='') as csv_file:
        readcsv = csv.reader(csv_file, delimiter=',')
        print(readcsv)  # prints the reader object itself, not the rows

        data = []
        print("\nThe given training examples are:")
        for row in readcsv:
            print(row)
            # `row and ...` skips the empty list produced by a blank line.
            if row and row[-1].upper() == "YES":
                data.append(row)

    print("\nThe positive examples are:")
    for x in data:
        print(x)
    print("\n")

    print("The steps of the Find-s algorithm are :\n", hypo)
    steps = find_s_steps(data)
    for step in steps:
        print(step)
    if steps:
        hypo = steps[-1]

    print("\nThe maximally specific Find-s hypothesis for the given training examples is :")
    print(list(hypo))

STEPS & OUTPUT:

to view steps & output click HERE

ML 2 - CANDIDATE-ELIMINATION ALGORITHM

2. FOR A GIVEN SET OF TRAINING DATA EXAMPLES STORED IN A .CSV FILE, IMPLEMENT AND DEMONSTRATE THE CANDIDATE-ELIMINATION ALGORITHM TO OUTPUT A DESCRIPTION OF THE SET OF ALL HYPOTHESES CONSISTENT WITH THE TRAINING EXAMPLES.


SOLUTION  1 

trainingdata.csv


Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

prog2.py


import csv

def candidate_elimination(data):
    """Run the simplified Candidate-Elimination procedure and print each step.

    Each row of `data` is a list of attribute values followed by a "Yes"/"No"
    label.  Rows with any other label leave S and G untouched but are still
    reported as a step.

    Args:
        data: list of rows, e.g. list(csv.reader(f)).

    Returns:
        A tuple (s, gh): the final specific hypothesis and the rows of the
        general boundary that contain at least one value other than '?'.
        ([], []) when `data` holds no non-blank row.
    """
    # csv.reader yields [] for blank lines; they carry no label to inspect.
    data = [row for row in data if row]
    if not data:
        return [], []

    # NOTE(review): the specific hypothesis is seeded from the SECOND row
    # (index 1), which is what the published step-by-step output reflects.
    # This is only sound when that row is a positive example - confirm for
    # other data sets.  A single-row file falls back to its only row.
    seed = data[1] if len(data) > 1 else data[0]
    s = list(seed[:-1])
    n = len(s)
    g = [['?'] * n for _ in range(n)]

    # enumerate gives the true position of each row; looking a row up by
    # value would report the first occurrence for duplicated examples.
    for step, row in enumerate(data, start=1):
        label = row[-1]
        if label == "Yes":
            for j in range(n):
                if row[j] != s[j]:
                    s[j] = '?'
                    g[j][j] = '?'
        elif label == "No":
            for j in range(n):
                g[j][j] = s[j] if row[j] != s[j] else '?'

        print("\nSteps of Candidate Elimination Algorithm", step)
        print(s)
        print(g)

    # Keep only the boundary rows that actually constrain some attribute.
    gh = [row for row in g if any(value != '?' for value in row)]
    return s, gh


if __name__ == "__main__":
    with open("trainingdata.csv", newline="") as f:
        data = list(csv.reader(f))

    s, gh = candidate_elimination(data)
    print("\nFinal specific hypothesis:\n", s)

    print("\nFinal general hypothesis:\n", gh)

Output

Steps of Candidate Elimination Algorithm 1
['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]

Steps of Candidate Elimination Algorithm 2
['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
[['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]

Steps of Candidate Elimination Algorithm 3
['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', 'Same']]

Steps of Candidate Elimination Algorithm 4
['Sunny', 'Warm', '?', 'Strong', '?', '?']
[['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?'], ['?', '?', '?', '?', '?', '?']]

Final specific hypothesis:
 ['Sunny', 'Warm', '?', 'Strong', '?', '?']

Final general hypothesis:

 [['Sunny', '?', '?', '?', '?', '?'], ['?', 'Warm', '?', '?', '?', '?']]

SOLUTION  2 

trainingdata.csv

sky,airTemp,humidity,wind,water,forecast,enjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

lab2.py

import numpy as np
import pandas as pd

# Load the training examples; pd.read_csv already returns a DataFrame,
# so no extra DataFrame wrapper is needed.
data = pd.read_csv('trainingdata.csv')
print(data)

# Feature matrix: every column except the last one
concepts = np.array(data.iloc[:, :-1])
print(concepts)

# Target labels: the last column, copied into a NumPy array
target = np.array(data.iloc[:, -1])
print(target)

def learn(concepts, target):
    """Run the simplified Candidate-Elimination procedure, printing each step.

    Args:
        concepts: 2-D array of attribute values, one row per training example.
        target: sequence of labels aligned with `concepts`; only the exact
            strings "Yes" and "No" have an effect.

    Returns:
        A tuple (specific_h, general_h): the final specific hypothesis (a copy
        of the first row with generalised attributes replaced by '?') and the
        rows of the general boundary that contain at least one value other
        than '?'.

    Raises:
        IndexError: if `concepts` has no rows (the first row seeds S).
    """
    # Seed S with the first instance; .copy() keeps `concepts` itself intact.
    specific_h = concepts[0].copy()
    print("\nInitialization of specific_h and general_h")
    print(specific_h)

    # G starts as one all-'?' row per attribute; the width follows the data
    # instead of being fixed at six attributes.
    n_attributes = len(specific_h)
    general_h = [["?"] * n_attributes for _ in range(n_attributes)]
    print(general_h)

    # The learning iterations
    for i, h in enumerate(concepts):

        # Positive example: generalise S (and relax G) where values differ
        if target[i] == "Yes":
            for x in range(n_attributes):
                if h[x] != specific_h[x]:
                    specific_h[x] = '?'
                    general_h[x][x] = '?'

        # Negative example: only G changes
        if target[i] == "No":
            for x in range(n_attributes):
                if h[x] != specific_h[x]:
                    general_h[x][x] = specific_h[x]
                else:
                    general_h[x][x] = '?'

        print("\nSteps of Candidate Elimination Algorithm", i + 1)
        print(specific_h)
        print(general_h)

    # Drop the rows of G that never received a constraint (all '?'),
    # whatever the number of attributes is.
    general_h = [row for row in general_h if any(value != '?' for value in row)]

    # Return final values
    return specific_h, general_h

# Learn both boundaries from the loaded examples, then report each one
# under its own heading.
s_final, g_final = learn(concepts, target)
for heading, hypothesis in (
    ("\nFinal Specific_h:", s_final),
    ("\nFinal General_h:", g_final),
):
    print(heading, hypothesis, sep="\n")


STEPS & OUTPUT:

to view steps & output click HERE

ML 3 - ID3 ALGORITHM

3. WRITE A PROGRAM TO DEMONSTRATE THE WORKING OF THE DECISION TREE BASED ID3 ALGORITHM. USE AN APPROPRIATE DATA SET FOR BUILDING THE DECISION TREE AND APPLY THIS KNOWLEDGE TO CLASSIFY A NEW SAMPLE.

 SOLUTION 1  ( with packages) (given by Lokesh sir)

tennisdata.csv


Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,FALSE,No
Sunny,Hot,High,TRUE,No
Overcast,Hot,High,FALSE,Yes
Rainy,Mild,High,FALSE,Yes
Rainy,Cool,Normal,FALSE,Yes
Rainy,Cool,Normal,TRUE,No
Overcast,Cool,Normal,TRUE,Yes
Sunny,Mild,High,FALSE,No
Sunny,Cool,Normal,FALSE,Yes
Rainy,Mild,Normal,FALSE,Yes
Sunny,Mild,Normal,TRUE,Yes
Overcast,Mild,High,TRUE,Yes
Overcast,Hot,Normal,FALSE,Yes
Rainy,Mild,High,TRUE,No

lab3.py

import pandas as pd
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO


data = pd.read_csv('tennisdata.csv')
print("The first 5 values of data is \n",data.head())

# Work on an explicit copy of the feature columns so that the label
# encoding below never writes into (or warns about) a slice of `data`.
X = data.iloc[:,:-1].copy()
print("\nThe first 5 values of Train data is \n",X.head())
y = data.iloc[:,-1]
print("\nThe first 5 values of Train output is \n",y.head())


# One encoder per feature column; they are kept as module-level names
# because new samples must be encoded with the same fitted encoders.
le_outlook = LabelEncoder()
X['Outlook'] = le_outlook.fit_transform(X['Outlook'])
le_Temperature = LabelEncoder()
X['Temperature'] = le_Temperature.fit_transform(X['Temperature'])
le_Humidity = LabelEncoder()
X['Humidity'] = le_Humidity.fit_transform(X['Humidity'])
# NOTE(review): pandas may parse the TRUE/FALSE column as booleans, in which
# case this encoder is fitted on bool values rather than on strings - confirm
# before passing string samples such as "False" to it.
le_Windy = LabelEncoder()
X['Windy'] = le_Windy.fit_transform(X['Windy'])

print("\nNow the Train data is",X.head())

le_PlayTennis = LabelEncoder()
y =  le_PlayTennis.fit_transform(y)
# This prints the encoded target column, so label it as the output.
print("\nNow the Train output is\n",y)

classifier = DecisionTreeClassifier()
classifier.fit(X,y)

def labelEncoderForInput(list1):
    """Encode one raw sample with the encoders fitted on the training data.

    The first four entries of `list1` are transformed, in order, by
    le_outlook, le_Temperature, le_Humidity and le_Windy.

    Args:
        list1: a sample as [outlook, temperature, humidity, windy].

    Returns:
        A one-row list of lists ([[...]]) holding the encoded sample, i.e.
        the 2-D shape passed to classifier.predict.  `list1` itself is left
        unchanged so the caller can still display the original labels.
    """
    encoders = (le_outlook, le_Temperature, le_Humidity, le_Windy)
    encoded = list(list1)  # encode a copy, not the caller's list
    for position, encoder in enumerate(encoders):
        encoded[position] = encoder.transform([encoded[position]])[0]
    return [encoded]

# Two raw samples; only inp1 is classified below.
inp = ["Rainy","Mild","High","False"]
inp1=["Rainy","Cool","High","False"]
# Encode a copy so that inp1 keeps its readable labels for the report below.
pred1 = labelEncoderForInput(list(inp1))
y_pred = classifier.predict(pred1)
# inverse_transform works on an array of encoded labels, so decode the whole
# prediction array first and then pick the single result.
print("\nfor input {0}, we obtain {1}".format(inp1, le_PlayTennis.inverse_transform(y_pred)[0]))

STEPS & OUTPUT:

to view steps & output click HERE

 SOLUTION  2   ( without packages)

tennisdata.csv


Outlook,Temperature,Humidity,Windy,PlayTennis
Sunny,Hot,High,FALSE,No
Sunny,Hot,High,TRUE,No
Overcast,Hot,High,FALSE,Yes
Rainy,Mild,High,FALSE,Yes
Rainy,Cool,Normal,FALSE,Yes
Rainy,Cool,Normal,TRUE,No
Overcast,Cool,Normal,TRUE,Yes
Sunny,Mild,High,FALSE,No
Sunny,Cool,Normal,FALSE,Yes
Rainy,Mild,Normal,FALSE,Yes
Sunny,Mild,Normal,TRUE,Yes
Overcast,Mild,High,TRUE,Yes
Overcast,Hot,Normal,FALSE,Yes
Rainy,Mild,High,TRUE,No

lab3.py

import numpy as np
import math
import csv

def read_data(filename):
    """Read a comma-separated file into a header list and a list of rows.

    Args:
        filename: path of the CSV file; its first line is the header.

    Returns:
        A tuple (metadata, traindata): the header fields as a list and the
        remaining non-blank rows as lists of strings.  An empty file yields
        ([], []).
    """
    # newline='' is the mode the csv module asks for when it parses a file.
    with open(filename, 'r', newline='') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',')
        metadata = list(next(datareader, []))
        # Blank lines come back as []; keeping them would give ragged rows.
        traindata = [row for row in datareader if row]

    return (metadata, traindata)

class Node:
    """One node of the decision tree.

    An internal node carries the name of the attribute it splits on and a
    list of (value, child) pairs; a leaf carries its class label in `answer`
    (the empty string means "not a leaf").
    """

    def __init__(self, attribute):
        # Every node starts out with no children and no answer.
        self.attribute, self.children, self.answer = attribute, [], ""

    def __str__(self):
        # A node is displayed as the attribute it tests.
        return self.attribute

def subtables(data, col, delete):
    """Split `data` into one sub-table per distinct value of column `col`.

    Args:
        data: 2-D NumPy array, one row per example.
        col: index of the column to partition on.
        delete: when true, column `col` is removed from every sub-table.

    Returns:
        A tuple (items, tables): the sorted distinct values found in the
        column and a dict mapping each value to the rows that carry it, in
        their original order.  The sub-tables keep the dtype of `data`, so
        text stays text (no conversion to fixed-width byte strings).
    """
    items = np.unique(data[:, col])
    tables = {}

    for item in items:
        # A boolean mask selects the matching rows as a new array.
        subset = data[data[:, col] == item]
        if delete:
            subset = np.delete(subset, col, 1)
        tables[item] = subset

    return items, tables

def entropy(S):
    """Return the Shannon entropy (base 2) of the values in `S`.

    Args:
        S: NumPy array of class labels.

    Returns:
        A plain float: 0.0 when `S` is empty or holds a single distinct
        value, otherwise -sum(p * log2(p)) over the relative frequency p of
        each distinct value.
    """
    _, counts = np.unique(S, return_counts=True)

    if counts.size <= 1:
        return 0.0

    probabilities = counts / counts.sum()
    # float() turns the NumPy scalar into an ordinary Python number.
    return float(-np.sum(probabilities * np.log2(probabilities)))

def gain_ratio(data, col):
    """Return the gain ratio obtained by splitting `data` on column `col`.

    The gain ratio is the information gain of the split divided by its
    intrinsic value (the entropy of the split proportions).  The class label
    is read from the last column of `data`.

    Args:
        data: 2-D NumPy array whose last column is the class label.
        col: index of the attribute column to evaluate.

    Returns:
        The gain ratio as a float; 0.0 when the column holds a single
        distinct value, because such a split has zero intrinsic value and
        the ratio would otherwise be 0/0.
    """
    items, tables = subtables(data, col, delete=False)

    total_size = data.shape[0]
    remainder = 0.0        # weighted entropy left after the split
    intrinsic_value = 0.0  # entropy of the split proportions

    for item in items:
        subset = tables[item]
        ratio = subset.shape[0] / total_size
        # entropy() may hand back a plain number or a one-element array;
        # np.sum collapses either form to a scalar.
        remainder += ratio * float(np.sum(entropy(subset[:, -1])))
        intrinsic_value -= ratio * math.log(ratio, 2)

    if intrinsic_value == 0:
        return 0.0

    information_gain = float(np.sum(entropy(data[:, -1]))) - remainder
    return information_gain / intrinsic_value

def create_node(data, metadata):
    """Recursively build the decision tree for `data`.

    Args:
        data: 2-D NumPy array; the last column is the class label and the
            other columns are attributes.
        metadata: attribute names aligned with the columns of `data`.

    Returns:
        A Node.  It is a leaf (its `answer` is set) when every row has the
        same label, or when no attribute is left to split on, in which case
        the most frequent label is used.  Otherwise it is an internal node
        named after the attribute with the highest gain ratio, with one
        (value, child) pair per distinct value of that attribute.
    """
    labels, label_counts = np.unique(data[:, -1], return_counts=True)
    n_attributes = data.shape[1] - 1

    # Stop on a pure subset, or fall back to the majority label once all
    # attributes are used up (there is then nothing left to pick a split from).
    if labels.shape[0] == 1 or n_attributes == 0:
        node = Node("")
        node.answer = labels[np.argmax(label_counts)]
        return node

    gains = np.zeros((n_attributes, 1))

    for col in range(n_attributes):
        gains[col] = gain_ratio(data, col)

    # A constant column can produce a 0/0 gain ratio (NaN), and argmax would
    # pick a NaN ahead of every real value; treat such columns as zero gain.
    gains = np.nan_to_num(gains)
    split = np.argmax(gains)

    node = Node(metadata[split])
    metadata = np.delete(metadata, split, 0)

    items, tables = subtables(data, split, delete=True)

    for item in items:
        child = create_node(tables[item], metadata)
        node.children.append((item, child))

    return node

def empty(size):
    """Return the indentation for tree level `size`: three spaces per level."""
    return "   " * size

def print_tree(node, level):
    """Print the subtree rooted at `node`, indented according to `level`.

    A leaf prints its answer.  An internal node prints its attribute, then
    each branch value one level deeper, followed by that branch's subtree
    two levels deeper.
    """
    indent = empty(level)
    if node.answer == "":
        print(indent, node.attribute)
        branch_indent = empty(level + 1)
        for value, child in node.children:
            print(branch_indent, value)
            print_tree(child, level + 2)
    else:
        print(indent, node.answer)

# Read the header and rows, turn the rows into an array, build the tree
# and print it starting at indentation level 0.
metadata, traindata = read_data("tennisdata.csv")
data = np.array(traindata)
node = create_node(data, metadata)
print_tree(node, level=0)


STEPS & OUTPUT:

to view steps & output click HERE