Teaching my Computer to Read
I have been doing a lot of work with letters and fonts lately in Printing Code class, so I thought it might be interesting to extend some of that into my Makematics project this week. One application of machine learning is identifying text—I read recently that Google Translate wants to introduce a feature that would allow users to submit pictures of words they see and receive translations in response.
I tweaked Greg’s sample code to create a model based on a series of letter images that I made in Photoshop. I made ten training images and ten test images for each of the letters A, B, and C. Some samples of those images appear at the beginning of the video during the training sequence. In the live video example, the computer was able to use that model to figure out what letter I was holding up. It’s not perfect—the letters have to take up a certain percentage of the box to get an accurate reading—but it’s pretty impressive.
Letter Recognizer code (training):
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | /*Kim Ash Makematics Fall 2012 letterRecognizer - Uses a trained Support Vector Machine to letter characters in live video. derived from: HandletterRecognizerInteractive by Greg Borenstein, October 2012 Distributed as part of PSVM: http://makematics.com/code/psvm Depends on HoG Processing: http://hogprocessing.altervista.org/ */ // uses both HoG Processing and Processing-SVM import hog.*; import psvm.*; PImage img; SVM model; int[] labels; // 1 = A, 2 = B, 3 = C String[] trainingFilenames, testFilenames; float[][] trainingFeatures; PImage testImage; double testResult = 0.0; void setup() { size(200, 100); // get the names of all of the files in the "train" folder java.io.File folder = new java.io.File(dataPath("train")); trainingFilenames = folder.list(); // setup labels array with space for labels for each // training file labels = new int[trainingFilenames.length]; trainingFeatures = new float[trainingFilenames.length][324]; // load in the labels based on the first part of // the training images' filenames for (int i = 0; i < trainingFilenames.length; i++) { println(trainingFilenames[i]); // split the filename by "-" and look at the first part // to decide the label String letterLabel = split(trainingFilenames[i], '-')[0]; if (letterLabel.equals("A")) { labels[i] = 1; } if (letterLabel.equals("B")) { labels[i] = 2; } if (letterLabel.equals("C")) { labels[i] = 3; } if (letterLabel.equals("V")) { 
labels[i] = 4; } if (letterLabel.equals("Five")) { labels[i] = 5; } if (letterLabel.equals("Point")) { labels[i] = 6; } // calculate the Histogram of Oriented Gradients for this image // use its results as a training vector in our SVM trainingFeatures[i] = gradientsForImage(loadImage("train/" + trainingFilenames[i])); } model = new SVM(this); SVMProblem problem = new SVMProblem(); // HoG always gives us back 324 gradients // for a 50x50 image. problem.setNumFeatures(324); // load the problem with the labels and training data problem.setSampleData(labels, trainingFeatures); // train the model model.train(problem); // save our model file as a text file model.saveModel("letter_model.txt"); // get a list of the names of the files in the "test" folder java.io.File testFolder = new java.io.File(dataPath("test")); testFilenames = testFolder.list(); // test a new random image testResult = testNewImage(); } // Function to test a new random image from the test folder // it returns the result of the SVM classification double testNewImage() { // pick a random number between 0 and the number of test images int imgNum = (int)random(0, testFilenames.length-1); // load a test image testImage = loadImage("test/" + testFilenames[imgNum]); return model.test(gradientsForImage(testImage)); } void draw() { background(0); image(testImage, 0, 0); String result = "Letter is: "; // display the name of the letter // in a different color depending on // the result of our SVM test switch((int)testResult) { case 1: fill(255, 125, 125); result = result + "A"; break; case 2: fill(125, 255, 125); result = result + "B"; break; case 3: fill(125, 125, 255); result = result + "C"; break; } text(result, testImage.width + 10, 20); } void keyPressed() { testResult = testNewImage(); } // Use HoG to calculate the gradients for an image. // We'll use this as our feature vector for our SVM. // HoG describes the shape of the object as transitions // between bright and dark pixels. 
// This function includes a lot of verbose and ugly // code because of the HoG library. float[] gradientsForImage(PImage img) { // resize the images to a consistent size: //img.resize(50, 50); // settings for Histogram of Oriented Gradients // (probably don't change these) int window_width=64; int window_height=128; int bins = 9; int cell_size = 8; int block_size = 2; boolean signed = false; int overlap = 0; int stride=16; int number_of_resizes=5; // a bunch of unecessarily verbose HOG code HOG_Factory hog = HOG.createInstance(); GradientsComputation gc=hog.createGradientsComputation(); Voter voter=MagnitudeItselfVoter.createMagnitudeItselfVoter(); HistogramsComputation hc=hog.createHistogramsComputation( bins, cell_size, cell_size, signed, voter); Norm norm=L2_Norm.createL2_Norm(0.1); BlocksComputation bc=hog.createBlocksComputation(block_size, block_size, overlap, norm); PixelGradientVector[][] pixelGradients = gc.computeGradients(img, this); Histogram[][] histograms = hc.computeHistograms(pixelGradients); Block[][] blocks = bc.computeBlocks(histograms); Block[][] normalizedBlocks = bc.normalizeBlocks(blocks); DescriptorComputation dc=hog.createDescriptorComputation(); return dc.computeDescriptor(normalizedBlocks); } |
Letter Recognizer Video code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 | /*Kim Ash Makematics Fall 2012 letterRecognizerVideo - Trains an SVM-based classifier to detect letters (A, B, C) based on Histogram of Oriented Gradients of letter images. derived from: HandletterRecognizer by Greg Borenstein, October 2012 Distributed as part of PSVM: http://makematics.com/code/psvm Depends on HoG Processing: http://hogprocessing.altervista.org/ */ import hog.*; import psvm.*; import processing.video.*; // Capture object for accessing video feed Capture video; // size of the box we'll be looking in for hand letters int rectW = 150; int rectH = 150; SVM model; PImage testImage; void setup() { size(640/2 + 60, 480/2); // capture video at half size for speed video = new Capture(this, 640/2, 480/2); video.start(); // declare our SVM object model = new SVM(this); // load the trained svm model from the file // our data has 324 dimensions because // that's what we get from doing Histogram of Oriented // Gradients on a 50x50 pixel image model.loadModel("letter_model.txt", 324); // initialize our PImage at 50x50 // we'll use this to display the part // of the video feed we're searching testImage = createImage(50, 50, RGB); } // video event, necessary for getting live camera void captureEvent(Capture c) { c.read(); } void draw() { background(0); // copy the pixels in the incoming video // into our testImage. 
Only use the pixels // inside the 150x150 square at the center // also resize down to 50x50 (last two arguments) // (the subtractions are to make sure we get the pixels // in our red box) testImage.copy(video, video.width - rectW - (video.width - rectW)/2, video.height - rectH - (video.height - rectH)/2, rectW, rectH, 0, 0, 50, 50); // run Histogram of Oriented Gradients on the testImage // and pass the results to our model for testing double testResult = model.test(gradientsForImage(testImage)); // display the video, the test image, and the red box image(video, 0, 0); image(testImage, width - testImage.width, 0); noFill(); stroke(255, 0, 0); strokeWeight(5); rect(video.width - rectW - (video.width - rectW)/2, video.height - rectH - (video.height - rectH)/2, rectW, rectH); // use the result of our SVM test // to decide what text to put on the screen // based on what letter is showing String result = "Letter is: "; switch((int)testResult) { case 1: fill(255, 125, 125); result = result + "A"; break; case 2: fill(125, 255, 125); result = result + "B"; break; case 3: fill(125, 125, 255); result = result + "C"; break; } text(result, 100, 20); } // Helper function that calculates the // Histogram of Oriented Gradients for // a PImage, filled with a lot of HoG magic float[] gradientsForImage(PImage img) { // settings for Histogram of Oriented Gradients // (probably don't change these) int window_width=64; int window_height=128; int bins = 9; int cell_size = 8; int block_size = 2; boolean signed = false; int overlap = 0; int stride=16; int number_of_resizes=5; // a bunch of unecessarily verbose HOG code HOG_Factory hog = HOG.createInstance(); GradientsComputation gc=hog.createGradientsComputation(); Voter voter=MagnitudeItselfVoter.createMagnitudeItselfVoter(); HistogramsComputation hc=hog.createHistogramsComputation( bins, cell_size, cell_size, signed, voter); Norm norm=L2_Norm.createL2_Norm(0.1); BlocksComputation bc=hog.createBlocksComputation(block_size, block_size, 
overlap, norm); PixelGradientVector[][] pixelGradients = gc.computeGradients(img, this); Histogram[][] histograms = hc.computeHistograms(pixelGradients); Block[][] blocks = bc.computeBlocks(histograms); Block[][] normalizedBlocks = bc.normalizeBlocks(blocks); DescriptorComputation dc=hog.createDescriptorComputation(); return dc.computeDescriptor(normalizedBlocks); } |