고문헌의 textualis로 쓰인 글자를 인식하기 위해서 사용한 신경망입니다. https://medium.com/@akashg/character-recognition-using-tensorflow-a93dbbdf4af 코드를 참고하여 작성하였고요, tensorflow와 scikit-learn이 설치되어 있어야 작동합니다.
신경망의 구조는 다음과 같습니다.
INPUT (크기: nImgSize * nImgSize)
↓ activation: sigmoid
LAYER1 (크기: layer1)
↓ activation: sigmoid
LAYER2 (크기: layer2)
↓ activation: softmax
OUTPUT (크기: nClasses)
고문헌 라틴어를 인식하는 작업에 imgSize = 32, layer1 = 800, layer2 = 200로 설정하고, 총 100여가지의 문자를 분류해보았는데, 약 91%의 정확도를 보였습니다.
import tensorflow as tf
import numpy, os, re, random, pickle
from skimage import io, transform
from sklearn.model_selection import KFold


class NNOCRModel:
    """Two-hidden-layer feed-forward network for recognizing textualis
    characters in old manuscripts (TF1 placeholder/Session API).

    Architecture: INPUT(nImgSize^2) -> sigmoid LAYER1 -> sigmoid LAYER2
    -> softmax OUTPUT(nClasses).
    """

    LEARNING_RATE = 0.001   # Adam learning rate
    BATCH_SIZE = 100        # mini-batch size used by train()/validateKFold()
    DISPLAY_STEP = 10       # print cost every this many epochs

    def __init__(self, layer1, layer2, imgSize):
        # layer1 / layer2 are the hidden-layer sizes; imgSize is the side
        # length of the (square) input images.
        self.nLayer1 = layer1
        self.nLayer2 = layer2
        self.nImgSize = imgSize
        self.nInput = imgSize ** 2
        self.nClasses = 0     # set later by prepareData()/_loadParams()
        self.oNames = []      # label names, index-aligned with the one-hot output
        self.xhat = None      # prepared input vectors (set by prepareData)
        self.yhat = None      # prepared one-hot labels (set by prepareData)
        self.sess = None      # TF session (set by train()/_loadModel())

    def __del__(self):
        # Close the TF session if one was ever opened.
        if self.sess:
            self.sess.close()

    @staticmethod
    def restoreName(s):
        """Decode '0xHHHH' escapes in a folder name back to characters."""
        return re.sub('0x([0-9a-fA-F]{4})',
                      lambda m: chr(int(m.group(1), 16)), s)

    def loadImg(self, path):
        """Load one image file and return it as a normalized flat vector."""
        raw = io.imread(path)
        inverted = numpy.invert(raw)
        gray = numpy.average(inverted, axis=2)  # collapse RGB to a single channel
        vec = transform.resize(gray, (self.nImgSize, self.nImgSize))
        vec = numpy.reshape(vec, self.nInput).astype(float)
        vec /= numpy.max(vec)  # scale so the brightest pixel is 1.0
        return vec

    # prepareData() reads images from inside datasetFolder.
    # datasetFolder must contain one folder per label, each holding image files,
    # e.g. dataset/a/001.png, dataset/a/002.png, dataset/b/001.png, dataset/b/002.png.
    # Each label is augmented up to maxAugmentation images.
def prepareData( self , datasetFolder, maxAugmentation = 150 ): folders = [] for d in os.listdir(datasetFolder): if os.path.isdir(datasetFolder + "/" + d): numImg = len ([img for img in os.listdir(datasetFolder + "/" + d) if os.path.isfile(datasetFolder + "/" + d + "/" + img)]) if numImg < 3 : continue folders.append((d, numImg)) self .nClasses = len (folders) self .oNames = [] data = [] for n, (folder, numImg) in enumerate (folders): oname = NNOCRModel.restoreName(folder) self .oNames.append(oname) augmentation = numImg < maxAugmentation augRatio = maxAugmentation / numImg if augRatio > 21 : degrees = list ( range ( - 8 , 9 , 2 )) elif augRatio > 15 : degrees = list ( range ( - 8 , 9 , 3 )) elif augRatio > 9 : degrees = list ( range ( - 8 , 9 , 4 )) elif augRatio > 3 : degrees = list ( range ( - 8 , 9 , 8 )) else : degrees = [ 2 ] for img in os.listdir(datasetFolder + "/" + folder): path = datasetFolder + "/" + folder + "/" + img if not os.path.isfile(path): continue image = io.imread(path) image = numpy.invert(image) image = numpy.average(image, axis = 2 ) xs = transform.resize(image, ( self .nImgSize, self .nImgSize)) xs = numpy.reshape(xs, self .nInput).astype( float ) xs / = numpy. max (xs) y = numpy.zeros(( self .nClasses,), dtype = float ) y[n] = 1 data.append((xs, y)) if augmentation: for scale in [ 0.9 , 1.0 , 1.1 ]: for degree in degrees: if scale = = 1.0 and degree = = 0 : continue if numImg > = maxAugmentation: break xs = transform.resize(transform.rescale(transform.rotate(image, degree), scale), ( self .nImgSize, self .nImgSize)) xs = numpy.reshape(xs, self .nInput).astype( float ) xs / = numpy. max (xs) data.append((xs, y)) numImg + = 1 random.seed( 1 ) random.shuffle(data) print ( self .oNames) print ( 'Total images: ' , len (data)) self .xhat = numpy.asarray([d[ 0 ] for d in data]) self .yhat = numpy.asarray([d[ 1 ] for d in data]) # 신경망을 생성합니다. 
def buildGraph( self ): self .x = tf.placeholder(tf.float32, [ None , self .nInput]) self .y = tf.placeholder(tf.float32, [ None , self .nClasses]) weights = { 'h1' : tf.Variable(tf.random_normal([ self .nInput, self .nLayer1], stddev = 0.01 ), dtype = tf.float32), 'h2' : tf.Variable(tf.random_normal([ self .nLayer1, self .nLayer2], stddev = 0.01 ), dtype = tf.float32), 'out' : tf.Variable(tf.random_normal([ self .nLayer2, self .nClasses], stddev = 0.01 ), dtype = tf.float32) } biases = { 'b1' : tf.Variable(tf.random_normal([ self .nLayer1], stddev = 0.01 ), dtype = tf.float32), 'b2' : tf.Variable(tf.random_normal([ self .nLayer2], stddev = 0.01 ), dtype = tf.float32), 'out' : tf.Variable(tf.random_normal([ self .nClasses], stddev = 0.01 ), dtype = tf.float32) } layer_1 = tf.add(tf.matmul( self .x, weights[ 'h1' ]), biases[ 'b1' ]) layer_1 = tf.nn.sigmoid(layer_1) layer_2 = tf.add(tf.matmul(layer_1, weights[ 'h2' ]), biases[ 'b2' ]) layer_2 = tf.nn.sigmoid(layer_2) self .pred = tf.nn.softmax(tf.matmul(layer_2, weights[ 'out' ]) + biases[ 'out' ]) self .cost = - tf.reduce_sum( self .pred * tf.log( self .y + 1e - 10 )) \ + 0.001 * ( tf.nn.l2_loss(weights[ 'h1' ]) + tf.nn.l2_loss(weights[ 'h2' ]) + tf.nn.l2_loss(weights[ 'out' ]) + tf.nn.l2_loss(biases[ 'b1' ]) + tf.nn.l2_loss(biases[ 'b2' ]) + tf.nn.l2_loss(biases[ 'out' ])) self .optimizer = tf.train.AdamOptimizer(learning_rate = NNOCRModel.LEARNING_RATE).minimize( self .cost) # 전체 데이터를 train과 test 데이터로 분할하여K cross validation을 실시합니다. def validateKFold( self , n = 5 , epochs = 500 ): accuracies = [] kf = KFold(n_splits = 5 ) for train, test in kf.split( self .xhat): trainX, trainY = self .xhat[train], self .yhat[train] testX, testY = self .xhat[test], self .yhat[test] init = tf.global_variables_initializer() with tf.Session() as sess: sess.run(init) for epoch in range (epochs): avg_cost = 0. 
total_batch = len (trainX) / / NNOCRModel.BATCH_SIZE for i in range (total_batch): batchX = trainX[i * len (trainX) / / total_batch : (i + 1 ) * len (trainX) / / total_batch] batchY = trainY[i * len (trainX) / / total_batch : (i + 1 ) * len (trainX) / / total_batch] _, c = sess.run([ self .optimizer, self .cost], feed_dict = { self .x: batchX, self .y: batchY}) avg_cost + = c / total_batch if epoch % NNOCRModel.DISPLAY_STEP = = 0 : print ( "Epoch:" , '%04d' % (epoch + 1 ), "cost=" , "{:.9f}" . format (avg_cost)) print ( "Optimization Finished!" ) correct_prediction = tf.equal(tf.argmax( self .pred, 1 ), tf.argmax( self .y, 1 )) accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float" )) accu = accuracy. eval ({ self .x: testX, self .y: testY}) print ( "Accuracy:" , accu) accuracies.append(accu) return accuracies def _saveParams( self , path): with open (path, 'wb' ) as f: pickle.dump(( self .nLayer1, self .nLayer2, self .nImgSize, self .nInput, self .nClasses, self .oNames), f) def _loadParams( self , path): with open (path, 'rb' ) as f: self .nLayer1, self .nLayer2, self .nImgSize, self .nInput, self .nClasses, self .oNames = pickle.load(f) # 전체 데이터를 이용해 train을 실시합니다. def train( self , epochs = 500 ): trainX, trainY = self .xhat, self .yhat init = tf.global_variables_initializer() self .sess = tf.Session() self .sess.run(init) for epoch in range (epochs): avg_cost = 0. total_batch = len (trainX) / / NNOCRModel.BATCH_SIZE for i in range (total_batch): batchX = trainX[i * len (trainX) / / total_batch: (i + 1 ) * len (trainX) / / total_batch] batchY = trainY[i * len (trainX) / / total_batch: (i + 1 ) * len (trainX) / / total_batch] _, c = self .sess.run([ self .optimizer, self .cost], feed_dict = { self .x: batchX, self .y: batchY}) avg_cost + = c / total_batch if epoch % NNOCRModel.DISPLAY_STEP = = 0 : print ( "Epoch:" , '%04d' % (epoch + 1 ), "cost=" , "{:.9f}" . format (avg_cost)) print ( "Optimization Finished!" 
) def _saveModel( self , path): saver = tf.train.Saver() saver.save( self .sess, path) def _loadModel( self , path): saver = tf.train.Saver() self .sess = tf.Session() saver.restore( self .sess, path) def save( self , path): self ._saveParams(path + '.pickle' ) self ._saveModel(path) @staticmethod def load(path): inst = NNOCRModel( 0 , 0 , 0 ) inst._loadParams(path + '.pickle' ) inst.buildGraph() inst._loadModel(path) return inst def predictByRawImg( self , imageArr, dictType = False ): dataArr = [] for img in imageArr: img = transform.resize(img, ( self .nImgSize, self .nImgSize)) img = numpy.reshape(numpy.invert(img), self .nInput).astype( float ) img / = max (numpy. max (img), 32 ) dataArr.append(img) return self .predict(numpy.array(dataArr), dictType) def predict( self , dataArr, dictType = False ): predictions = self .pred. eval ({ self .x: dataArr}, session = self .sess) if dictType: predictions = [ dict ( zip ( self .oNames, p)) for p in predictions] return predictions if __name__ = = '__main__' : ocr = NNOCRModel( 800 , 200 , 32 ) ocr.prepareData( 'labeledChr/' , 400 ) ocr.buildGraph() #성능 평가는 다음과 같이 #accs = ocr.validateKFold(5, 400) #print("Avg Accuracy: %g" % (sum(accs) / len(accs))) ocr.train( 600 ) ocr.save( './model' ) # 로딩은 다음과 같이 #ocr = NNOCRModel.load('./model') #ocr.predict(numpy.random.rand(1, 1024)) |
Tensorflow는 정말 간단하면서도 강력한 도구인듯합니다. 이렇게 간단하게 문자인식을 할수 있다니, 확실히 요즘 대세가 되는 이유가 있군요.
[Python] tomotopy로 쉽게 토픽 모델링 실시하기 (59) | 2019.05.22 |
---|---|
Python용 토픽 모델링 패키지 - tomotopy 개발 (12) | 2019.05.19 |
[Kiwi] 지능형 한국어 형태소 분석기 0.6버전 업데이트 (1) | 2018.12.09 |
AdaGram : 동음이의어를 구분하는 Word2Vec (1) | 2018.09.23 |
Word2Vec을 이용한 한국어 관련어 네트워크 보기 (0) | 2018.09.17 |
Kneser-Ney 언어 모형을 활용한 한국어 초성체 해석기 개발 (0) | 2018.06.17 |
댓글 영역