Teaching my Computer to Read

I have been doing a lot of work with letters and fonts lately in Printing Code class, so I thought it might be interesting to extend some of that into my Makematics project this week. One application of machine learning is identifying text–I read recently that Google Translate wants to introduce a feature that would allow users to submit pictures of words they see and receive translations in response.

I tweaked Greg’s sample code to create a model based on a series of letter images that I made in Photoshop. I made ten training images and ten test images for each of the letters A, B, and C. Some samples of those images appear at the beginning of the video during the training sequence. In the live video example, the computer was able to use that model to figure out what letter I was holding up. It’s not perfect–the letters have to take up a certain percentage of the box to get an accurate reading–but it’s pretty impressive.

Letter Recognizer code (training):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/*Kim Ash
Makematics Fall 2012
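letterRecognizer - Trains an SVM-based classifier to recognize letters (A, B, C) from the Histogram of Oriented Gradients of letter images, saves the model to a file, and tests it on sample images.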
 
derived from:
HandletterRecognizerInteractive by Greg Borenstein, October 2012
Distributed as part of PSVM: http://makematics.com/code/psvm
Depends on HoG Processing: http://hogprocessing.altervista.org/
*/
 
// uses both HoG Processing and Processing-SVM
import hog.*;
import psvm.*;
 
// NOTE(review): no function shown in this sketch reads or writes this
// global (gradientsForImage has its own parameter named img) - confirm
// whether it is still needed.
PImage img;
 
// the classifier; created and trained in setup()
SVM model;
// one label per training file, filled in by setup():
// 1 = A, 2 = B, 3 = C (setup() also maps the prefixes "V", "Five" and
// "Point" to 4, 5 and 6; files with any other prefix keep the default 0)
int[] labels; // 1 = A, 2 = B, 3 = C
// names of the files found in the "train" and "test" data folders
String[] trainingFilenames, testFilenames;
// one HoG feature vector per training file
float[][] trainingFeatures;
 
// the test image currently on screen and the label the SVM gave it
PImage testImage;
double testResult = 0.0;
 
void setup() {
  size(200, 100); 
 
  // get the names of all of the files in the "train" folder
  java.io.File folder = new java.io.File(dataPath("train"));
  trainingFilenames = folder.list();
 
  // setup labels array with space for labels for each
  // training file
  labels = new int[trainingFilenames.length];
  trainingFeatures = new float[trainingFilenames.length][324];
 
  // load in the labels based on the first part of 
  // the training images' filenames
  for (int i = 0; i < trainingFilenames.length; i++) {
    println(trainingFilenames[i]);
    // split the filename by "-" and look at the first part
    // to decide the label
    String letterLabel = split(trainingFilenames[i], '-')[0];
    if (letterLabel.equals("A")) {
      labels[i] = 1;
    }
 
    if (letterLabel.equals("B")) {
      labels[i] = 2;
    }
 
    if (letterLabel.equals("C")) {
      labels[i] = 3;
    }
 
    if (letterLabel.equals("V")) {
      labels[i] = 4;
    }
 
    if (letterLabel.equals("Five")) {
      labels[i] = 5;
    }
 
    if (letterLabel.equals("Point")) {
      labels[i] = 6;
    }
 
    // calculate the Histogram of Oriented Gradients for this image
    // use its results as a training vector in our SVM
    trainingFeatures[i] = gradientsForImage(loadImage("train/" + trainingFilenames[i]));
  }
 
  model = new SVM(this);
  SVMProblem problem = new SVMProblem();
  // HoG always gives us back 324 gradients
  // for a 50x50 image.
  problem.setNumFeatures(324);
  // load the problem with the labels and training data
  problem.setSampleData(labels, trainingFeatures);
  // train the model
  model.train(problem);
 
  // save our model file as a text file
  model.saveModel("letter_model.txt");
 
  // get a list of the names of the files in the "test" folder
  java.io.File testFolder = new java.io.File(dataPath("test"));
  testFilenames = testFolder.list();
  // test a new random image
  testResult = testNewImage();
}
 
// Picks a random image from the "test" folder, stores it in the global
// testImage (so draw() can display it) and returns the label the
// trained SVM assigns to it.
// Expects testFilenames to have been filled in by setup().
double testNewImage() {
  // random(high) returns a float in [0, high), so truncating it to an
  // int yields every index from 0 to length-1. The previous form,
  // random(0, length-1), could never select the last test image.
  int imgNum = (int)random(testFilenames.length);
  // load the chosen test image
  testImage = loadImage("test/" + testFilenames[imgNum]);
  // classify it from its Histogram of Oriented Gradients
  return model.test(gradientsForImage(testImage));
}
 
void draw() {
  background(0);
  image(testImage, 0, 0);
 
  String result = "Letter is: ";
 
  // display the name of the letter
  // in a different color depending on
  // the result of our SVM test
  switch((int)testResult) {
  case 1:
    fill(255, 125, 125);
    result = result + "A";
    break;
  case 2:
    fill(125, 255, 125);
    result = result + "B";
    break;
  case 3:
    fill(125, 125, 255);
    result = result + "C";
    break;
  }
 
 
  text(result, testImage.width + 10, 20);
}
 
// Key handler: classify another randomly chosen test image and store
// the resulting label in testResult, which draw() reads to pick the
// caption and color.
void keyPressed() {
  testResult = testNewImage();
}
 
// Use HoG to calculate the gradients for an image.
// We'll use this as our feature vector for our SVM.
// HoG describes the shape of the object as transitions
// between bright and dark pixels.
// This function includes a lot of verbose and ugly
// code because of the HoG library.
float[] gradientsForImage(PImage img) {
  // resize the images to a consistent size:
  //img.resize(50, 50);
 
  // settings for Histogram of Oriented Gradients
  // (probably don't change these)
  int window_width=64;
  int window_height=128;
  int bins = 9;
  int cell_size = 8;
  int block_size = 2;
  boolean signed = false;
  int overlap = 0;
  int stride=16;
  int number_of_resizes=5;
 
  // a bunch of unecessarily verbose HOG code
  HOG_Factory hog = HOG.createInstance();
  GradientsComputation gc=hog.createGradientsComputation();
  Voter voter=MagnitudeItselfVoter.createMagnitudeItselfVoter();
  HistogramsComputation hc=hog.createHistogramsComputation( bins, cell_size, cell_size, signed, voter);
  Norm norm=L2_Norm.createL2_Norm(0.1);
  BlocksComputation bc=hog.createBlocksComputation(block_size, block_size, overlap, norm);
  PixelGradientVector[][] pixelGradients = gc.computeGradients(img, this);
  Histogram[][] histograms = hc.computeHistograms(pixelGradients);
  Block[][] blocks = bc.computeBlocks(histograms);
  Block[][] normalizedBlocks = bc.normalizeBlocks(blocks);
  DescriptorComputation dc=hog.createDescriptorComputation();    
 
  return dc.computeDescriptor(normalizedBlocks);
}

Letter Recognizer Video code:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
/*Kim Ash
Makematics Fall 2012
letterRecognizerVideo - Uses a trained Support Vector Machine to recognize letters (A, B, C) in live video,
based on the Histogram of Oriented Gradients of the region inside an on-screen box.
 
derived from:
HandletterRecognizer by Greg Borenstein, October 2012
Distributed as part of PSVM: http://makematics.com/code/psvm
Depends on HoG Processing: http://hogprocessing.altervista.org/
*/
 
import hog.*;
import psvm.*;
import processing.video.*;
 
// Capture object for accessing video feed
Capture video;
// width and height, in video pixels, of the box we search for a letter
// (draw() samples this region and outlines it in red)
int rectW = 150;
int rectH = 150;
 
// classifier loaded from the saved model file in setup()
SVM model;
// 50x50 copy of the boxed region; this is what gets classified and
// shown in the corner of the window
PImage testImage;
 
void setup() {
  // window: the half-size video plus a 60 pixel wide strip on the
  // right, where draw() shows the 50x50 test image
  size(640/2 + 60, 480/2);

  // capture video at half size for speed
  int captureW = 640/2;
  int captureH = 480/2;
  video = new Capture(this, captureW, captureH);
  video.start();

  // create the SVM and load the previously trained model; the data
  // has 324 dimensions because, per the original notes, that is what
  // Histogram of Oriented Gradients produces for a 50x50 pixel image
  model = new SVM(this);
  model.loadModel("letter_model.txt", 324);

  // 50x50 buffer that receives the part of the video feed we search
  testImage = createImage(50, 50, RGB);
}
 
// Video event callback, necessary for getting live camera frames:
// reads the new frame from the capture object that raised the event.
void captureEvent(Capture c) {
  c.read();
}
 
void draw() {
  background(0);
 
  // copy the pixels in the incoming video
  // into our testImage. Only use the pixels
  // inside the 150x150 square at the center
  // also resize down to 50x50 (last two arguments)
  // (the subtractions are to make sure we get the pixels
  // in our red box)
  testImage.copy(video, video.width - rectW - (video.width - rectW)/2, video.height - rectH - (video.height - rectH)/2, rectW, rectH, 0, 0, 50, 50);
 
  // run Histogram of Oriented Gradients on the testImage
  // and pass the results to our model for testing
  double testResult = model.test(gradientsForImage(testImage)); 
 
  // display the video, the test image, and the red box
  image(video, 0, 0);
  image(testImage, width - testImage.width, 0);
  noFill();
  stroke(255, 0, 0);
  strokeWeight(5);
  rect(video.width - rectW - (video.width - rectW)/2, video.height - rectH - (video.height - rectH)/2, rectW, rectH);
 
  // use the result of our SVM test
  // to decide what text to put on the screen
  // based on what letter is showing
  String result = "Letter is: ";
  switch((int)testResult) {
  case 1:
    fill(255, 125, 125);
    result = result + "A";
    break;
  case 2:
    fill(125, 255, 125);
    result = result + "B";
    break;
  case 3:
    fill(125, 125, 255);
    result = result + "C";
    break;
  }
  text(result, 100, 20);
}
 
// Helper function that calculates the 
// Histogram of Oriented Gradients for
// a PImage, filled with a lot of HoG magic
float[] gradientsForImage(PImage img) {
  // settings for Histogram of Oriented Gradients
  // (probably don't change these)
  int window_width=64;
  int window_height=128;
  int bins = 9;
  int cell_size = 8;
  int block_size = 2;
  boolean signed = false;
  int overlap = 0;
  int stride=16;
  int number_of_resizes=5;
 
  // a bunch of unecessarily verbose HOG code
  HOG_Factory hog = HOG.createInstance();
  GradientsComputation gc=hog.createGradientsComputation();
  Voter voter=MagnitudeItselfVoter.createMagnitudeItselfVoter();
  HistogramsComputation hc=hog.createHistogramsComputation( bins, cell_size, cell_size, signed, voter);
  Norm norm=L2_Norm.createL2_Norm(0.1);
  BlocksComputation bc=hog.createBlocksComputation(block_size, block_size, overlap, norm);
  PixelGradientVector[][] pixelGradients = gc.computeGradients(img, this);
  Histogram[][] histograms = hc.computeHistograms(pixelGradients);
  Block[][] blocks = bc.computeBlocks(histograms);
  Block[][] normalizedBlocks = bc.normalizeBlocks(blocks);
  DescriptorComputation dc=hog.createDescriptorComputation();    
 
  return dc.computeDescriptor(normalizedBlocks);
}