funMV: Python에서 sqlite를 이용한 BoVW 구현

# Python에서 sqlite를 이용한 BoVW 구현
# funMV, 2014.12
from numpy import *
import pickle
from pysqlite2 import dbapi2 as sqlite
import sift
import imagesearch

# Build the list of image paths from the index file that sits in the
# dataset sub-directory, and derive the matching feature-file paths.
import os

with open("ukbench/first1000/list.txt", 'r') as f:
    # Strip only the line terminator (so a final line without a trailing
    # newline keeps its last character) and skip blank lines.
    imlist = ['ukbench/first1000/' + line.rstrip('\r\n')
              for line in f if line.strip()]

nbr_images = len(imlist)
# One feature file per image: the same path with the extension replaced by
# '.sift', whatever the length of the original extension.
featlist = [os.path.splitext(imname)[0] + '.sift' for imname in imlist]

# Load the visual vocabulary that was pickled to disk beforehand.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only use a vocabulary.pkl that comes from a trusted source.
with open('vocabulary.pkl', 'rb') as f:
    voc = pickle.load(f)

# Create the database.
# NOTE: test1.db must be deleted before re-running this script, because the
# file is written to disk once the script has run.
con = sqlite.connect('test1.db')

schema_statements = (
    # tables
    'create table imlist(filename)',
    'create table imwords(imid,wordid,vocname)',
    'create table imhistograms(imid,histogram,vocname)',
    # indexes: the script also works without these four statements, but it
    # seems to get somewhat slower
    'create index im_idx on imlist(filename)',
    'create index wordid_idx on imwords(wordid)',
    'create index imid_idx on imwords(imid)',
    'create index imidhist_idx on imhistograms(imid)',
)
for statement in schema_statements:
    con.execute(statement)

con.commit()

# Sanity check: project the features of the first image onto the vocabulary.
first_feature_file = featlist[0]
locs, descr = sift.read_features_from_file(first_feature_file)
# In the author's run: locs is 2276x4 and descr is 2276x128, i.e. the first
# image has 2276 features that get projected onto the visual words.

imwords = voc.project(descr)
# In the author's run the vocabulary has 498 visual words, and imwords is the
# 498-bin voting histogram: the number of features voted to each word.
# For example 7 features were voted to the first word, 6 to the second, etc.:
# array([ 7., 6., 2., 1., 5., 4., 4., 1., 0., 4., 2.,
#         3., 6., 1., 2., 4., 2., 0., 1., 9., 1., 1.,
#         2., 3., 0., 1., 7., 3., 2., 7., 3., 0., 5.,
#        17., 1., 3., 16., 6., 3., 8., 26., 11., 1., 10.,
#         3., 3., 4., 2., 2., 1., 2., 1., 2., 2., ...
nbr_words = imwords.shape[0]  # 498 in the author's run

# The code above was only a sanity check; from here on the features of the
# images are actually inserted into the database.
# Go through the first 100 images (fewer if the list is shorter), project
# their features on the vocabulary and insert the results.
for imname, featfile in zip(imlist[:100], featlist[:100]):
    locs, descr = sift.read_features_from_file(featfile)

    # Per the author's notes, imwords[j] is the number of features of this
    # image that were voted to visual word j.
    imwords = voc.project(descr)

    # (1) Store the file name. A bound parameter keeps file names that
    # contain quote characters from breaking the statement.
    cur = con.execute("insert into imlist(filename) values (?)", (imname,))
    imid = cur.lastrowid

    # (2) Link the image id to every visual word that occurs in the image.
    # wordid has to be the index of the word, not its vote count: the
    # candidate lookup later in this script selects rows by word index.
    for word in imwords.nonzero()[0]:
        con.execute("insert into imwords(imid,wordid,vocname) values (?,?,?)",
                    (imid, int(word), voc.name))

    # (3) Store the complete histogram under the image id.
    con.execute("insert into imhistograms(imid,histogram,vocname) values (?,?,?)",
                (imid, pickle.dumps(imwords), voc.name))

# Commit here so the inserted rows are saved even if the script stops in the
# query section below.
con.commit()
# To use the stored database again later:
#   con = sqlite.connect('test1.db')

# Quick checks on the stored data.
image_count_row = con.execute('select count (filename) from imlist').fetchone()
print(image_count_row)
# (100,) -> 100 image names are stored

first_image_row = con.execute('select * from imlist').fetchone()
print(first_image_row)
# (u'ukbench/first1000/ukbench00000.jpg',)

##################################################
# From here on: tests that use the stored database
##################################################

# Fetch the id and the histogram of the first image, which is the query.
im_id = con.execute("select rowid from imlist where filename=?",
                    (imlist[0],)).fetchone()
# im_id=(1,)
if im_id is None:
    raise LookupError("query image is not in the database: %s" % imlist[0])

# im_id is a 1-tuple, so it is passed directly as the parameter sequence.
# The histogram is selected through its imid column, which holds the image
# id, rather than through the implicit rowid of imhistograms.
s = con.execute("select histogram from imhistograms where imid=?",
                im_id).fetchone()
h = pickle.loads(str(s[0]))  # len(.)=498, histogram for word voting

# Use the index to get candidates.
# (Alternatively the query features could be read and projected again:
#  words = voc.project(descr).nonzero()[0])

# Indices of the visual words that received at least one vote in the query
# histogram.
words = h.nonzero()[0]
# In the author's run: words.shape=(455,)=[0,1,2,3,5,....]

# Find candidates: for every word of the query image, fetch from table
# imwords the ids of all images that contain that word, and collect them.
candidates = []
for word in words:
    # int() turns the numpy integer into a plain int for parameter binding.
    im_ids = con.execute("select distinct imid from imwords where wordid=?",
                         (int(word),)).fetchall()
    candidates.extend(row[0] for row in im_ids)

# In the author's run: len(candidates) = 1443

# Take all unique candidate image ids and sort them by number of
# occurrences, most frequent first.
from collections import Counter

# Counter tallies every id in a single pass instead of calling
# candidates.count() once per distinct id.
tmp = sorted(Counter(candidates).items(),
             key=lambda pair: pair[1], reverse=True)
# Example from the author's run, before sorting:
# candidates.count(1)=23, candidates.count(10)=15
# set(candidates)=[1,2,3,4,...,99,100]
# tmp=[(1, 23), (2, 23), (3, 19), (4, 26), (5, 11), (6, 13), (7, 14),
#      (8, 11), (9, 28), (10, 15), (11, 14), (12, 30), (13, 10),.....
#      (95, 27), (96, 31), (97, 19), (98, 16), (99, 18), (100, 17)]

# Only the image ids are needed from here on.
candi = [w[0] for w in tmp]  # len(candi)=100
# candi=[43,77,44,42,78,79,...,21,84,83]

# Score every candidate by the idf-weighted distance between its stored
# histogram and the query histogram h, then list the closest matches.
matchscores = []
for imid in candi:
    # Select by the imid column, which holds the image id, rather than by
    # the implicit rowid of imhistograms.
    s = con.execute("select histogram from imhistograms where imid=?",
                    (imid,)).fetchone()
    cand_h = pickle.loads(str(s[0]))  # histogram for word voting

    cand_dist = sqrt(sum(voc.idf * (h - cand_h) ** 2))
    matchscores.append((cand_dist, imid))

# Smallest distance first; the query image itself has distance 0.
matchscores.sort()
print(matchscores[:10])
# Output of the author's run:
# [(0.0, 1), (60.812088499474271, 2), (61.547483004618186, 3), (92.620967753952812, 4),
#  (100.59065889285603, 34), (107.76370948763174, 28), (108.27892205906744, 25),
#  (109.39719124624605, 9), (110.33866766043165, 10), (110.77231202013482, 20)]

con.commit()
con.close()

funMV

2014년 11월 30일 일요일

Python에서 sqlite를 이용한 BoVW 구현

댓글 없음:

댓글 쓰기

태그

프로필