import hashlib
import urllib.request

import cv2
import numpy as np
from matplotlib import pyplot
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

# HOG parameters
winSize = (32, 32)
blockSize = (16, 16)
blockStride = (8, 8)
cellSize = (8, 8)
nbins = 9
derivAperture = 1
winSigma = 4.
histogramNormType = 0
L2HysThreshold = 0.2
gammaCorrection = 0
nlevels = 64
winStride = (8, 8)
padding = (8, 8)
locations = ((10, 20),)
hog = cv2.HOGDescriptor(winSize, blockSize, blockStride, cellSize, nbins,
                        derivAperture, winSigma, histogramNormType,
                        L2HysThreshold, gammaCorrection, nlevels)
# compute(img[, winStride[, padding[, locations]]]) -> descriptors
# TODO: time it

outdir = 'images/'
root = 'https://upload.wikimedia.org/wikipedia/commons/thumb/'
filename_good = 'correctpairs.csv'
filename_bad = 'wrongpairs.csv'


def download_image(filename):
    """Download a 600px-wide thumbnail of `filename` from the Wikimedia Commons repository."""
    m = hashlib.md5()
    m.update(filename.encode('utf-8'))
    sumfilename = m.hexdigest()
    first = sumfilename[0]
    second = sumfilename[:2]
    # thumb path format: first char of md5 sum / first two chars / filename / <width>px-filename
    url = root + first + '/' + second + '/' + filename + '/600px-' + filename
    urllib.request.urlretrieve(url, outdir + filename)


def evaluate_distance(imagename1, imagename2):
    """Return the Pearson correlation between the HOG descriptors of two images,
    or None if either image cannot be read."""
    # download_image(imagename1)
    # download_image(imagename2)
    filename1 = outdir + imagename1
    filename2 = outdir + imagename2
    # read images as grayscale
    try:
        img1 = cv2.imread(filename1, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(filename2, cv2.IMREAD_GRAYSCALE)
    except cv2.error:
        return None
    if img1 is None or img2 is None:
        return None
    # compute HOG descriptors (column vectors of shape (n, 1))
    hist1 = hog.compute(img1, winStride, padding, locations)
    hist2 = hog.compute(img2, winStride, padding, locations)
    # compute distance between the two descriptors
    # dist = cosine_similarity(hist1.reshape(1, -1), hist2.reshape(1, -1))
    dist = pearsonr(hist1.ravel(), hist2.ravel())
    correlation = 0 if np.isnan(dist[0]) else dist[0]
    return correlation


def read_file_and_compute(filename):
    """Read tab-separated image pairs from `filename` and return the list of
    HOG correlations, counting the pairs that could not be evaluated."""
    distances = []
    count = 0
    with open(filename) as f:
        for line in f:
            row = line.rstrip('\n').split('\t')
            img1 = row[1]
            img2 = row[2]
            d = evaluate_distance(img1, img2)
            if d is not None:
                distances.append(d)
            else:
                count += 1
    print(count)  # number of pairs skipped (missing or unreadable images)
    return distances


distances_good = read_file_and_compute(filename_good)
distances_bad = read_file_and_compute(filename_bad)
print('good ' + str(np.mean(np.asarray(distances_good))))
print('bad ' + str(np.mean(np.asarray(distances_bad))))
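
# --- Optional: visualize the two score distributions -----------------------
# A minimal sketch (not part of the original pipeline) showing how the
# already-imported matplotlib `pyplot` could be used to compare the Pearson
# correlations of correct vs. wrong pairs. The function name
# `plot_distance_histograms` and the output filename are illustrative
# assumptions; it could be called with the `distances_good` and
# `distances_bad` lists computed above.
def plot_distance_histograms(good, bad, outfile='hog_correlations.png'):
    # overlay normalized histograms so the two distributions are comparable
    pyplot.hist(good, bins=20, alpha=0.5, density=True, label='correct pairs')
    pyplot.hist(bad, bins=20, alpha=0.5, density=True, label='wrong pairs')
    pyplot.xlabel('Pearson correlation of HOG descriptors')
    pyplot.ylabel('density')
    pyplot.legend()
    pyplot.savefig(outfile)
    pyplot.close()

# Example usage (uncomment to run after the means are printed):
# plot_distance_histograms(distances_good, distances_bad)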