# <center>CS568:Deep Learning</center>  <center>Spring 2020</center> 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Load Data

In [0]:
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras import layers
from keras.optimizers import Adam
import sys

max_length = 2
learning_rate = 0.01
num_epochs = 100
batch_size = 1000

def load_data(filename):
    df = pd.read_csv(filename)
    df = df.filter(['Name'])
    df = np.array(df)
    return df

def preprocess_data(data):    
    inputs = []
    targets = []    
    vocab = ''   
    for item in data:       
        item = str(np.squeeze(item))       
        
        # track all possible characters to generate
        vocab += item
        
        # create tokens from each name
        for i in range(len(item) - max_length):
            inputs.append(item[i : i + max_length])
            targets.append(item[i + max_length])

    # get list of unique characters to generate from
    chars = sorted(list(set(vocab)))
    data_size, chars_size = len(data), len(chars)
    print("Data has {} characters, {} unique".format(str(data_size), str(chars_size)))

    char_indices = dict((ch, chars.index(ch)) for ch in chars)
    
    # create empty numpy arrays for X and y
    X = np.zeros((len(inputs), max_length, chars_size), dtype=np.bool)
    t = np.zeros((len(inputs), chars_size), dtype=np.bool)

    # one-hot encode selections
    for inp, indiv_input in enumerate(inputs):
        for tar, indiv_char in enumerate(indiv_input):
            X[inp, tar, char_indices[indiv_char]] = 1
        t[inp, char_indices[targets[inp]]] = 1
    
    return X, t, chars, char_indices

## Define model

In [0]:
def build_model(chars_length):
    model = Sequential()
    model.add(layers.LSTM(128, input_shape=(max_length, chars_length)))
    model.add(layers.Dense(chars_length, activation='softmax'))     
    model.compile(loss='categorical_crossentropy', optimizer= Adam(lr = learning_rate))    
    return model

In [0]:
def sample(output, total):
    output = np.asarray(output).astype('float64')
    output = np.log(output) / total
    exp_output = np.exp(output)
    output = exp_output / np.sum(exp_output)
    probas = np.random.multinomial(1, output, 1)
    out = np.argmax(probas)
    return out
  
def generate_names(seed, length, chars, char_indices, char_length):
    generated_text = seed
    name = seed

    for i in range(length - len(seed)):
        sampled = np.zeros((1, max_length, char_length))

        for tar, char in enumerate(generated_text):   
            # print(generated_text)      
            # print(char, char_indices[char], tar) 
            sampled[0, tar, char_indices[char]] = 1.   

        preds = model.predict(sampled, verbose=0)[0]
        next_index = sample(preds, 0.5)
        next_char = chars[next_index]

        generated_text += next_char
        generated_text = generated_text[1:]

        name += next_char
        
    return name


In [6]:
data = load_data("/content/drive/My Drive/muslim_girls_names.csv")
print(data.shape, data.dtype)
X, t, chars, char_indices = preprocess_data(data)
print("X.shape ", X.shape)
print("t.shape ", t.shape)

model = build_model(len(chars))
model.summary()
model.fit(X, t, epochs=num_epochs, batch_size=batch_size)

(4442, 1) object
Data has 4442 characters, 52 unique
X.shape  (19500, 2, 52)
t.shape  (19500, 52)
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 128)               92672     
_________________________________________________________________
dense_1 (Dense)              (None, 52)                6708      
Total params: 99,380
Trainable params: 99,380
Non-trainable params: 0
_________________________________________________________________
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 3

<keras.callbacks.callbacks.History at 0x7fce80357ef0>

In [11]:
generate_names("Ab", 7, chars, char_indices, len(chars))

'Abeerah'

In [17]:
generate_names("An", 7, chars, char_indices, len(chars))

'Anaheen'

In [18]:
generate_names("Mi", 7, chars, char_indices, len(chars))

'Mishadi'

In [31]:
generate_names("Ma", 5, chars, char_indices, len(chars))

'Mahid'

In [34]:
generate_names("Z", 6, chars, char_indices, len(chars))

'Zairam'

In [38]:
generate_names("Za", 8, chars, char_indices, len(chars))

'Zareenah'