NASDAG.org

A data scientist blog, by Philippe Dagher

Classifying Bees With Google TensorFlow

There is often confusion about the differences between bumblebees and honeybees, and even some of our top media channels will publish pictures of bumblebees when they are discussing or writing about honeybees.

These bees have different behaviors and appearances, but given the variety of backgrounds, positions, and image resolutions it can be a challenge for machines to tell them apart.

Wild bees are important pollinators and the spread of colony collapse disorder has only made their role more critical.

In this post, we will build a basic TensorFlow algorithm to determine the genus—Apis (honey bee) or Bombus (bumble bee)—based on photographs of the insects. The purpose is to test Google TensorFlow and not to reach the 99.56% accuracy obtained during the Metis challenge.


The dataset is available here. It contains 3969 labeled images scaled at 200px x 200px. To speed up the process we will resize them to 100x100. We will also use random rotations during the learning phase. I also added random flipping starting at step 2500.

1
2
import numpy as np
import pandas as pd
1
2
3
4
# Load the training labels and shuffle the rows so that the positional
# train/test split taken later operates on randomly ordered examples.
bees = pd.read_csv('bees/train_labels.csv')
bees = bees.reindex(np.random.permutation(bees.index))

# Total count, then Apis (genus==1) count and Bombus (genus==0) count.
print len(bees), len(bees[bees.genus==1]), len(bees[bees.genus==0])
3969 3142 827
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from scipy.ndimage import imread
from scipy.misc import imresize

# Read each image by its id, downscale from 200x200 to 100x100, and keep
# only the first three channels (drops any alpha channel).
bees['images'] = [imresize(imread('bees/images/train/'+str(bee)+'.jpg'),
                           (100, 100))[:, :, :3] for bee in bees.id]

import matplotlib.pyplot as plt
%matplotlib inline

# Preview a 2x8 grid of sample images with their genus labels.
plt.figure(figsize=(12, 4))
for i in range(16):
    plt.subplot(2, 8, i+1)
    plt.imshow(bees.images[i+333])
    plt.xticks([])
    plt.yticks([])
    # genus==1 maps to "Apis" (honey bee), genus==0 to "Bombus" (bumble bee).
    plt.title(["Bombus", "Apis"][int(bees.genus[i+333])])
plt.tight_layout()

png

1
2
import tensorflow as tf
# InteractiveSession registers itself as the default session, so the
# Tensor.eval() / Operation.run() calls below need no explicit session.
sess = tf.InteractiveSession()
1
2
3
4
5
6
7
def weight_variable(shape):
  """Create a weight Variable of the given shape, initialized from a
  truncated normal distribution with stddev 0.1 (breaks symmetry)."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Create a bias Variable of the given shape, filled with the constant
  0.1 (slightly positive so ReLU units start active)."""
  return tf.Variable(tf.constant(0.1, shape=shape))
1
2
3
4
5
6
def conv2d(x, W):
  """2-D convolution of x with kernel W, stride 1, SAME (zero) padding,
  so spatial dimensions are preserved."""
  unit_stride = [1, 1, 1, 1]
  return tf.nn.conv2d(x, W, strides=unit_stride, padding='SAME')

def max_pool_2x2(x):
  """2x2 max pooling with stride 2 and SAME padding — halves each
  spatial dimension of x."""
  window = [1, 2, 2, 1]
  return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# Inputs: batches of 100x100 RGB images; labels are one-hot over 2 classes.
x = tf.placeholder(tf.float32, [None, 100, 100, 3])
y_ = tf.placeholder(tf.float32, [None, 2])

# First conv layer: 5x5 kernels, 3 input channels -> 32 feature maps.
W_conv1 = weight_variable([5, 5, 3, 32])
b_conv1 = bias_variable([32])

h_conv1 = tf.nn.relu(conv2d(x, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)  # 100x100 -> 50x50

# Second conv layer: 5x5 kernels, 32 -> 64 feature maps.
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)  # 50x50 -> 25x25

# Fully connected layer: flatten the 25x25x64 activations into 250 units.
W_fc1 = weight_variable([25*25*64, 250])
b_fc1 = bias_variable([250])

h_pool2_flat = tf.reshape(h_pool2, [-1, 25*25*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

# Dropout on the FC layer; keep_prob is fed per run (0.5 train, 1.0 eval).
keep_prob = tf.placeholder("float")
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

# Readout layer: 250 units -> softmax over the 2 genera.
W_fc2 = weight_variable([250, 2])
b_fc2 = bias_variable([2])

y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
from random import sample, choice
from scipy.ndimage.interpolation import rotate

# One-hot encode the genus labels (columns in sorted order: 0, 1).
y_true = pd.get_dummies(bees.genus)

# Positional split of the shuffled frame: first 3000 train, rest test.
training_rows = range(0,3000)
test_rows = range(3000,3969)

# Stack the test images into one (n, 100, 100, 3) array, scaled by 1/256.
X_test = np.concatenate([arr[np.newaxis] for arr in bees.images.loc[test_rows]/256.])

# Cross-entropy loss; the 1e-9 epsilon guards against log(0).
cross_entropy = -tf.reduce_sum(y_*tf.log(y_conv + 1e-9))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
sess.run(tf.initialize_all_variables())

for i in range(2501):
    # Random mini-batch of 50 rows, augmented with a random 90-degree rotation.
    batch_rows = sample(training_rows, 50)
    X_training = np.concatenate([arr[np.newaxis] for arr in \
                                 bees.images.loc[batch_rows].apply(lambda x: rotate(x, choice([0,90,180,270]))/256.)])

    # Report train/test accuracy every 100 steps (dropout disabled at eval).
    if i%100 == 0:
        train_accuracy = accuracy.eval(feed_dict={x: X_training,
                                                  y_: y_true.loc[batch_rows].values, 
                                                  keep_prob: 1.0})
        print "step {:d}, training accuracy {:.3f}".format(i, train_accuracy), 
        print "- test accuracy {:.3f}".format(accuracy.eval(feed_dict={x: X_test,
                                                                       y_: y_true.loc[test_rows].values,
                                                                       keep_prob: 1.0}))
    # One Adam step on the batch with dropout keep probability 0.5.
    train_step.run(feed_dict={x: X_training,
                              y_: y_true.loc[batch_rows].values, 
                              keep_prob: 0.5})
step 0, training accuracy 0.440 - test accuracy 0.488
step 100, training accuracy 0.760 - test accuracy 0.795
step 200, training accuracy 0.840 - test accuracy 0.791
step 300, training accuracy 0.760 - test accuracy 0.796
step 400, training accuracy 0.880 - test accuracy 0.798
step 500, training accuracy 0.820 - test accuracy 0.796
step 600, training accuracy 0.940 - test accuracy 0.796
step 700, training accuracy 0.740 - test accuracy 0.806
step 800, training accuracy 0.800 - test accuracy 0.802
step 900, training accuracy 0.820 - test accuracy 0.797
step 1000, training accuracy 0.800 - test accuracy 0.804
step 1100, training accuracy 0.840 - test accuracy 0.802
step 1200, training accuracy 0.860 - test accuracy 0.801
step 1300, training accuracy 0.820 - test accuracy 0.803
step 1400, training accuracy 0.860 - test accuracy 0.810
step 1500, training accuracy 0.800 - test accuracy 0.808
step 1600, training accuracy 0.780 - test accuracy 0.807
step 1700, training accuracy 0.820 - test accuracy 0.813
step 1800, training accuracy 0.880 - test accuracy 0.814
step 1900, training accuracy 0.860 - test accuracy 0.808
step 2000, training accuracy 0.940 - test accuracy 0.804
step 2100, training accuracy 0.820 - test accuracy 0.824
step 2200, training accuracy 0.860 - test accuracy 0.821
step 2300, training accuracy 0.880 - test accuracy 0.807
step 2400, training accuracy 0.900 - test accuracy 0.827
step 2500, training accuracy 0.880 - test accuracy 0.836
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Continue training with stronger augmentation: besides random rotations,
# images are also flipped left-right, flipped up-down, or left unchanged.
# NOTE(review): choice([...]) is evaluated once per batch when building the
# apply() call, so all 50 images in a batch get the SAME flip — presumably
# per-image flips were intended; confirm before reusing this pattern.
for i in range(2555,10000):
    batch_rows = sample(training_rows, 50)
    X_training = np.concatenate([arr[np.newaxis] for arr in \
                                 bees.images.loc[batch_rows]\
                                         .apply(choice([(lambda x: x), np.fliplr, np.flipud]))\
                                         .apply(lambda x: rotate(x, choice([0,90,180,270]))/256.)])

    # Report train/test accuracy every 100 steps (dropout disabled at eval).
    if i%100 == 0:
        train_accuracy = accuracy.eval(feed_dict={x: X_training,
                                                  y_: y_true.loc[batch_rows].values, 
                                                  keep_prob: 1.0})
        print "step {:d}, training accuracy {:.3f}".format(i, train_accuracy), 
        print "- test accuracy {:.3f}".format(accuracy.eval(feed_dict={x: X_test,
                                                                       y_: y_true.loc[test_rows].values,
                                                                       keep_prob: 1.0}))
    # One Adam step on the batch with dropout keep probability 0.5.
    train_step.run(feed_dict={x: X_training,
                              y_: y_true.loc[batch_rows].values, 
                              keep_prob: 0.5})
step 2600, training accuracy 0.780 - test accuracy 0.820
step 2700, training accuracy 0.840 - test accuracy 0.810
step 2800, training accuracy 0.900 - test accuracy 0.831
step 2900, training accuracy 0.800 - test accuracy 0.831
step 3000, training accuracy 0.880 - test accuracy 0.828
step 3100, training accuracy 0.880 - test accuracy 0.843
step 3200, training accuracy 0.900 - test accuracy 0.825
step 3300, training accuracy 0.940 - test accuracy 0.831
step 3400, training accuracy 0.840 - test accuracy 0.834
step 3500, training accuracy 0.900 - test accuracy 0.846
step 3600, training accuracy 0.780 - test accuracy 0.841
step 3700, training accuracy 0.780 - test accuracy 0.849
step 3800, training accuracy 0.820 - test accuracy 0.842
step 3900, training accuracy 0.860 - test accuracy 0.852
step 4000, training accuracy 0.960 - test accuracy 0.837
step 4100, training accuracy 0.880 - test accuracy 0.841
step 4200, training accuracy 0.920 - test accuracy 0.845
step 4300, training accuracy 0.920 - test accuracy 0.851
step 4400, training accuracy 0.840 - test accuracy 0.858
step 4500, training accuracy 0.880 - test accuracy 0.854
step 4600, training accuracy 0.900 - test accuracy 0.854
step 4700, training accuracy 0.940 - test accuracy 0.847
step 4800, training accuracy 0.900 - test accuracy 0.844
step 4900, training accuracy 0.940 - test accuracy 0.856
step 5000, training accuracy 0.960 - test accuracy 0.848
step 5100, training accuracy 0.900 - test accuracy 0.854
step 5200, training accuracy 0.960 - test accuracy 0.852
step 5300, training accuracy 0.940 - test accuracy 0.852
step 5400, training accuracy 0.940 - test accuracy 0.846
step 5500, training accuracy 0.860 - test accuracy 0.854
step 5600, training accuracy 0.920 - test accuracy 0.851
step 5700, training accuracy 0.920 - test accuracy 0.849
step 5800, training accuracy 0.960 - test accuracy 0.857
step 5900, training accuracy 0.980 - test accuracy 0.858
step 6000, training accuracy 0.960 - test accuracy 0.850
step 6100, training accuracy 0.940 - test accuracy 0.850
step 6200, training accuracy 0.860 - test accuracy 0.852
step 6300, training accuracy 1.000 - test accuracy 0.856
step 6400, training accuracy 0.980 - test accuracy 0.851
step 6500, training accuracy 0.960 - test accuracy 0.859
step 6600, training accuracy 0.960 - test accuracy 0.851
step 6700, training accuracy 0.960 - test accuracy 0.852
step 6800, training accuracy 0.980 - test accuracy 0.860
step 6900, training accuracy 1.000 - test accuracy 0.857
step 7000, training accuracy 1.000 - test accuracy 0.860
step 7100, training accuracy 0.960 - test accuracy 0.850
step 7200, training accuracy 0.920 - test accuracy 0.856
step 7300, training accuracy 0.960 - test accuracy 0.863
step 7400, training accuracy 1.000 - test accuracy 0.854
step 7500, training accuracy 1.000 - test accuracy 0.866
step 7600, training accuracy 1.000 - test accuracy 0.854
step 7700, training accuracy 1.000 - test accuracy 0.861
step 7800, training accuracy 1.000 - test accuracy 0.864
step 7900, training accuracy 0.960 - test accuracy 0.846
step 8000, training accuracy 0.940 - test accuracy 0.861
step 8100, training accuracy 0.980 - test accuracy 0.857
step 8200, training accuracy 1.000 - test accuracy 0.853
step 8300, training accuracy 1.000 - test accuracy 0.858
step 8400, training accuracy 0.980 - test accuracy 0.853
step 8500, training accuracy 0.960 - test accuracy 0.860
step 8600, training accuracy 0.980 - test accuracy 0.862
step 8700, training accuracy 0.960 - test accuracy 0.853
step 8800, training accuracy 1.000 - test accuracy 0.864
step 8900, training accuracy 1.000 - test accuracy 0.857
step 9000, training accuracy 1.000 - test accuracy 0.854
step 9100, training accuracy 1.000 - test accuracy 0.860
step 9200, training accuracy 0.980 - test accuracy 0.866
step 9300, training accuracy 0.980 - test accuracy 0.866
step 9400, training accuracy 1.000 - test accuracy 0.868
step 9500, training accuracy 1.000 - test accuracy 0.863
step 9600, training accuracy 1.000 - test accuracy 0.864
step 9700, training accuracy 1.000 - test accuracy 0.865
step 9800, training accuracy 0.980 - test accuracy 0.854
step 9900, training accuracy 1.000 - test accuracy 0.857