00001
00019 #ifndef __D_T_TEMPLATED_VOCABULARY__
00020 #define __D_T_TEMPLATED_VOCABULARY__
00021
00022 #include <cassert>
00023
00024 #include <vector>
00025 #include <numeric>
00026 #include <fstream>
00027 #include <string>
00028 #include <algorithm>
00029 #include <opencv/cv.h>
00030
00031 #include "FeatureVector.h"
00032 #include "BowVector.h"
00033 #include "ScoringObject.h"
00034
00035 #include "DUtils.h"
00036
00037 using namespace std;
00038
00039 namespace DBoW2 {
00040
00043 template<class TDescriptor, class F>
00045 class TemplatedVocabulary
00046 {
00047 public:
00048
00056 TemplatedVocabulary(int k = 10, int L = 5,
00057 WeightingType weighting = TF_IDF, ScoringType scoring = L1_NORM);
00058
00063 TemplatedVocabulary(const std::string &filename);
00064
00069 TemplatedVocabulary(const char *filename);
00070
00075 TemplatedVocabulary(const TemplatedVocabulary<TDescriptor, F> &voc);
00076
00080 virtual ~TemplatedVocabulary();
00081
00088 TemplatedVocabulary<TDescriptor, F>& operator=(
00089 const TemplatedVocabulary<TDescriptor, F> &voc);
00090
00096 virtual void create
00097 (const std::vector<std::vector<TDescriptor> > &training_features);
00098
00106 virtual void create
00107 (const std::vector<std::vector<TDescriptor> > &training_features,
00108 int k, int L);
00109
00115 virtual void create
00116 (const std::vector<std::vector<TDescriptor> > &training_features,
00117 int k, int L, WeightingType weighting, ScoringType scoring);
00118
00123 virtual inline unsigned int size() const;
00124
00129 virtual inline bool empty() const;
00130
00136 virtual void transform(const std::vector<TDescriptor>& features, BowVector &v)
00137 const;
00138
00146 virtual void transform(const std::vector<TDescriptor>& features,
00147 BowVector &v, FeatureVector &fv, int levelsup) const;
00148
00154 virtual WordId transform(const TDescriptor& feature) const;
00155
00163 inline double score(const BowVector &a, const BowVector &b) const;
00164
00172 virtual NodeId getParentNode(WordId wid, int levelsup) const;
00173
00180 void getWordsFromNode(NodeId nid, std::vector<WordId> &words) const;
00181
00186 inline int getBranchingFactor() const { return m_k; }
00187
00192 inline int getDepthLevels() const { return m_L; }
00193
00198 float getEffectiveLevels() const;
00199
00205 virtual inline TDescriptor getWord(WordId wid) const;
00206
00212 virtual inline WordValue getWordWeight(WordId wid) const;
00213
00218 inline WeightingType getWeightingType() const { return m_weighting; }
00219
00224 inline ScoringType getScoringType() const { return m_scoring; }
00225
00230 inline void setWeightingType(WeightingType type);
00231
00236 void setScoringType(ScoringType type);
00237
00242 void save(const std::string &filename) const;
00243
00248 void load(const std::string &filename);
00249
00254 virtual void save(cv::FileStorage &fs,
00255 const std::string &name = "vocabulary") const;
00256
00263 virtual void load(const cv::FileStorage &fs,
00264 const std::string &name = "vocabulary");
00265
00278 virtual int stopWords(double minWeight);
00279
00280 protected:
00281
00283 typedef const TDescriptor *pDescriptor;
00284
00286 struct Node
00287 {
00289 NodeId id;
00291 WordValue weight;
00293 vector<NodeId> children;
00295 NodeId parent;
00297 TDescriptor descriptor;
00298
00300 WordId word_id;
00301
00305 Node(): id(0), weight(0), parent(0), word_id(0){}
00306
00311 Node(NodeId _id): id(_id), weight(0), parent(0), word_id(0){}
00312
00317 inline bool isLeaf() const { return children.empty(); }
00318 };
00319
00320 protected:
00321
00325 void createScoringObject();
00326
00332 void getFeatures(
00333 const vector<vector<TDescriptor> > &training_features,
00334 vector<pDescriptor> &features) const;
00335
00344 virtual void transform(const TDescriptor &feature,
00345 WordId &id, WordValue &weight, NodeId* nid = NULL, int levelsup = 0) const;
00346
00352 virtual void transform(const TDescriptor &feature, WordId &id) const;
00353
00361 void HKmeansStep(NodeId parent_id, const vector<pDescriptor> &descriptors,
00362 int current_level);
00363
00369 virtual void initiateClusters(const vector<pDescriptor> &descriptors,
00370 vector<TDescriptor> &clusters) const;
00371
00378 void initiateClustersKMpp(const vector<pDescriptor> &descriptors,
00379 vector<TDescriptor> &clusters) const;
00380
00384 void createWords();
00385
00392 void setNodeWeights(const vector<vector<TDescriptor> > &features);
00393
00394 protected:
00395
00397 int m_k;
00398
00400 int m_L;
00401
00403 WeightingType m_weighting;
00404
00406 ScoringType m_scoring;
00407
00409 GeneralScoring* m_scoring_object;
00410
00412 std::vector<Node> m_nodes;
00413
00416 std::vector<Node*> m_words;
00417
00418 };
00419
00420
00421
00422 template<class TDescriptor, class F>
00423 TemplatedVocabulary<TDescriptor,F>::TemplatedVocabulary
00424 (int k, int L, WeightingType weighting, ScoringType scoring)
00425 : m_k(k), m_L(L), m_weighting(weighting), m_scoring(scoring),
00426 m_scoring_object(NULL)
00427 {
00428 createScoringObject();
00429 }
00430
00431
00432
00433 template<class TDescriptor, class F>
00434 TemplatedVocabulary<TDescriptor,F>::TemplatedVocabulary
00435 (const std::string &filename): m_scoring_object(NULL)
00436 {
00437 load(filename);
00438 }
00439
00440
00441
00442 template<class TDescriptor, class F>
00443 TemplatedVocabulary<TDescriptor,F>::TemplatedVocabulary
00444 (const char *filename): m_scoring_object(NULL)
00445 {
00446 load(filename);
00447 }
00448
00449
00450
00451 template<class TDescriptor, class F>
00452 void TemplatedVocabulary<TDescriptor,F>::createScoringObject()
00453 {
00454 delete m_scoring_object;
00455 m_scoring_object = NULL;
00456
00457 switch(m_scoring)
00458 {
00459 case L1_NORM:
00460 m_scoring_object = new L1Scoring;
00461 break;
00462
00463 case L2_NORM:
00464 m_scoring_object = new L2Scoring;
00465 break;
00466
00467 case CHI_SQUARE:
00468 m_scoring_object = new ChiSquareScoring;
00469 break;
00470
00471 case KL:
00472 m_scoring_object = new KLScoring;
00473 break;
00474
00475 case BHATTACHARYYA:
00476 m_scoring_object = new BhattacharyyaScoring;
00477 break;
00478
00479 case DOT_PRODUCT:
00480 m_scoring_object = new DotProductScoring;
00481 break;
00482
00483 }
00484 }
00485
00486
00487
00488 template<class TDescriptor, class F>
00489 void TemplatedVocabulary<TDescriptor,F>::setScoringType(ScoringType type)
00490 {
00491 m_scoring = type;
00492 createScoringObject();
00493 }
00494
00495
00496
00497 template<class TDescriptor, class F>
00498 void TemplatedVocabulary<TDescriptor,F>::setWeightingType(WeightingType type)
00499 {
00500 this->m_weighting = type;
00501 }
00502
00503
00504
00505 template<class TDescriptor, class F>
00506 TemplatedVocabulary<TDescriptor,F>::TemplatedVocabulary(
00507 const TemplatedVocabulary<TDescriptor, F> &voc)
00508 : m_scoring_object(NULL)
00509 {
00510 *this = voc;
00511 }
00512
00513
00514
00515 template<class TDescriptor, class F>
00516 TemplatedVocabulary<TDescriptor,F>::~TemplatedVocabulary()
00517 {
00518 delete m_scoring_object;
00519 }
00520
00521
00522
00523 template<class TDescriptor, class F>
00524 TemplatedVocabulary<TDescriptor, F>&
00525 TemplatedVocabulary<TDescriptor,F>::operator=
00526 (const TemplatedVocabulary<TDescriptor, F> &voc)
00527 {
00528 this->m_k = voc.m_k;
00529 this->m_L = voc.m_L;
00530 this->m_scoring = voc.m_scoring;
00531 this->m_weighting = voc.m_weighting;
00532
00533 this->createScoringObject();
00534
00535 this->m_nodes.clear();
00536 this->m_words.clear();
00537
00538 this->m_nodes = voc.m_nodes;
00539 this->createWords();
00540
00541 return *this;
00542 }
00543
00544
00545
00546 template<class TDescriptor, class F>
00547 void TemplatedVocabulary<TDescriptor,F>::create(
00548 const std::vector<std::vector<TDescriptor> > &training_features)
00549 {
00550 m_nodes.clear();
00551 m_words.clear();
00552
00553
00554 int expected_nodes =
00555 (int)((pow((double)m_k, (double)m_L + 1) - 1)/(m_k - 1));
00556
00557 m_nodes.reserve(expected_nodes);
00558
00559
00560 vector<pDescriptor> features;
00561 getFeatures(training_features, features);
00562
00563
00564
00565 m_nodes.push_back(Node(0));
00566
00567
00568 HKmeansStep(0, features, 1);
00569
00570
00571 createWords();
00572
00573
00574 setNodeWeights(training_features);
00575
00576 }
00577
00578
00579
00580 template<class TDescriptor, class F>
00581 void TemplatedVocabulary<TDescriptor,F>::create(
00582 const std::vector<std::vector<TDescriptor> > &training_features,
00583 int k, int L)
00584 {
00585 m_k = k;
00586 m_L = L;
00587
00588 create(training_features);
00589 }
00590
00591
00592
00593 template<class TDescriptor, class F>
00594 void TemplatedVocabulary<TDescriptor,F>::create(
00595 const std::vector<std::vector<TDescriptor> > &training_features,
00596 int k, int L, WeightingType weighting, ScoringType scoring)
00597 {
00598 m_k = k;
00599 m_L = L;
00600 m_weighting = weighting;
00601 m_scoring = scoring;
00602 createScoringObject();
00603
00604 create(training_features);
00605 }
00606
00607
00608
00609 template<class TDescriptor, class F>
00610 void TemplatedVocabulary<TDescriptor,F>::getFeatures(
00611 const vector<vector<TDescriptor> > &training_features,
00612 vector<pDescriptor> &features) const
00613 {
00614 features.resize(0);
00615
00616 typename vector<vector<TDescriptor> >::const_iterator vvit;
00617 typename vector<TDescriptor>::const_iterator vit;
00618 for(vvit = training_features.begin(); vvit != training_features.end(); ++vvit)
00619 {
00620 features.reserve(features.size() + vvit->size());
00621 for(vit = vvit->begin(); vit != vvit->end(); ++vit)
00622 {
00623 features.push_back(&(*vit));
00624 }
00625 }
00626 }
00627
00628
00629
00630 template<class TDescriptor, class F>
00631 void TemplatedVocabulary<TDescriptor,F>::HKmeansStep(NodeId parent_id,
00632 const vector<pDescriptor> &descriptors, int current_level)
00633 {
00634 if(descriptors.empty()) return;
00635
00636
00637 vector<TDescriptor> clusters;
00638 vector<vector<unsigned int> > groups;
00639
00640
00641 clusters.reserve(m_k);
00642 groups.reserve(m_k);
00643
00644
00645
00646
00648
00649 if((int)descriptors.size() <= m_k)
00650 {
00651
00652 groups.resize(descriptors.size());
00653
00654 for(unsigned int i = 0; i < descriptors.size(); i++)
00655 {
00656 groups[i].push_back(i);
00657 clusters.push_back(*descriptors[i]);
00658 }
00659 }
00660 else
00661 {
00662
00663
00664 bool first_time = true;
00665 bool goon = true;
00666
00667
00668 vector<int> last_association, current_association;
00669
00670 while(goon)
00671 {
00672
00673
00674 if(first_time)
00675 {
00676
00677 initiateClusters(descriptors, clusters);
00678 }
00679 else
00680 {
00681
00682
00683 for(unsigned int c = 0; c < clusters.size(); ++c)
00684 {
00685 vector<pDescriptor> cluster_descriptors;
00686 cluster_descriptors.reserve(groups[c].size());
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696
00697
00698 vector<unsigned int>::const_iterator vit;
00699 for(vit = groups[c].begin(); vit != groups[c].end(); ++vit)
00700 {
00701 cluster_descriptors.push_back(descriptors[*vit]);
00702 }
00703
00704
00705 F::meanValue(cluster_descriptors, clusters[c]);
00706 }
00707
00708 }
00709
00710
00711
00712
00713 groups.clear();
00714 groups.resize(clusters.size(), vector<unsigned int>());
00715 current_association.resize(descriptors.size());
00716
00717
00718
00719 typename vector<pDescriptor>::const_iterator fit;
00720
00721 for(fit = descriptors.begin(); fit != descriptors.end(); ++fit)
00722 {
00723 double best_dist = F::distance(*(*fit), clusters[0]);
00724 unsigned int icluster = 0;
00725
00726 for(unsigned int c = 1; c < clusters.size(); ++c)
00727 {
00728 double dist = F::distance(*(*fit), clusters[c]);
00729 if(dist < best_dist)
00730 {
00731 best_dist = dist;
00732 icluster = c;
00733 }
00734 }
00735
00736
00737
00738 groups[icluster].push_back(fit - descriptors.begin());
00739 current_association[ fit - descriptors.begin() ] = icluster;
00740 }
00741
00742
00743
00744
00745 if(first_time)
00746 {
00747 first_time = false;
00748 }
00749 else
00750 {
00751
00752
00753 goon = false;
00754 for(unsigned int i = 0; i < current_association.size(); i++)
00755 {
00756 if(current_association[i] != last_association[i]){
00757 goon = true;
00758 break;
00759 }
00760 }
00761 }
00762
00763 if(goon)
00764 {
00765
00766 last_association = current_association;
00767
00768 }
00769
00770 }
00771
00772 }
00773
00774
00775 for(unsigned int i = 0; i < clusters.size(); ++i)
00776 {
00777 NodeId id = m_nodes.size();
00778 m_nodes.push_back(Node(id));
00779 m_nodes.back().descriptor = clusters[i];
00780 m_nodes.back().parent = parent_id;
00781 m_nodes[parent_id].children.push_back(id);
00782 }
00783
00784
00785 if(current_level < m_L)
00786 {
00787
00788 const vector<NodeId> &children_ids = m_nodes[parent_id].children;
00789 for(unsigned int i = 0; i < clusters.size(); ++i)
00790 {
00791 NodeId id = children_ids[i];
00792
00793 vector<pDescriptor> child_features;
00794 child_features.reserve(groups[i].size());
00795
00796 vector<unsigned int>::const_iterator vit;
00797 for(vit = groups[i].begin(); vit != groups[i].end(); ++vit)
00798 {
00799 child_features.push_back(descriptors[*vit]);
00800 }
00801
00802 if(child_features.size() > 1)
00803 {
00804 HKmeansStep(id, child_features, current_level + 1);
00805 }
00806 }
00807 }
00808 }
00809
00810
00811
00812 template<class TDescriptor, class F>
00813 void TemplatedVocabulary<TDescriptor, F>::initiateClusters
00814 (const vector<pDescriptor> &descriptors, vector<TDescriptor> &clusters) const
00815 {
00816 initiateClustersKMpp(descriptors, clusters);
00817 }
00818
00819
00820
00821 template<class TDescriptor, class F>
00822 void TemplatedVocabulary<TDescriptor,F>::initiateClustersKMpp(
00823 const vector<pDescriptor> &pfeatures, vector<TDescriptor> &clusters) const
00824 {
00825
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836 DUtils::Random::SeedRandOnce();
00837
00838 clusters.resize(0);
00839 clusters.reserve(m_k);
00840 vector<double> min_dists(pfeatures.size(), std::numeric_limits<double>::max());
00841
00842
00843
00844 int ifeature = DUtils::Random::RandomInt(0, pfeatures.size()-1);
00845
00846
00847 clusters.push_back(*pfeatures[ifeature]);
00848
00849
00850 typename vector<pDescriptor>::const_iterator fit;
00851 vector<double>::iterator dit;
00852 dit = min_dists.begin();
00853 for(fit = pfeatures.begin(); fit != pfeatures.end(); ++fit, ++dit)
00854 {
00855 *dit = F::distance(*(*fit), clusters.back());
00856 }
00857
00858 while((int)clusters.size() < m_k)
00859 {
00860
00861 dit = min_dists.begin();
00862 for(fit = pfeatures.begin(); fit != pfeatures.end(); ++fit, ++dit)
00863 {
00864 if(*dit > 0)
00865 {
00866 double dist = F::distance(*(*fit), clusters.back());
00867 if(dist < *dit) *dit = dist;
00868 }
00869 }
00870
00871
00872 double dist_sum = std::accumulate(min_dists.begin(), min_dists.end(), 0.0);
00873
00874 if(dist_sum > 0)
00875 {
00876 double cut_d;
00877 do
00878 {
00879 cut_d = DUtils::Random::RandomValue<double>(0, dist_sum);
00880 } while(cut_d == 0.0);
00881
00882 double d_up_now = 0;
00883 for(dit = min_dists.begin(); dit != min_dists.end(); ++dit)
00884 {
00885 d_up_now += *dit;
00886 if(d_up_now >= cut_d) break;
00887 }
00888
00889 if(dit == min_dists.end())
00890 ifeature = pfeatures.size()-1;
00891 else
00892 ifeature = dit - min_dists.begin();
00893
00894 clusters.push_back(*pfeatures[ifeature]);
00895
00896 }
00897 else
00898 break;
00899
00900 }
00901
00902 }
00903
00904
00905
00906 template<class TDescriptor, class F>
00907 void TemplatedVocabulary<TDescriptor,F>::createWords()
00908 {
00909 m_words.resize(0);
00910
00911 if(!m_nodes.empty())
00912 {
00913 m_words.reserve( (int)pow((double)m_k, (double)m_L) );
00914
00915 typename vector<Node>::iterator nit;
00916
00917 nit = m_nodes.begin();
00918 for(++nit; nit != m_nodes.end(); ++nit)
00919 {
00920 if(nit->isLeaf())
00921 {
00922 nit->word_id = m_words.size();
00923 m_words.push_back( &(*nit) );
00924 }
00925 }
00926 }
00927 }
00928
00929
00930
00931 template<class TDescriptor, class F>
00932 void TemplatedVocabulary<TDescriptor,F>::setNodeWeights
00933 (const vector<vector<TDescriptor> > &training_features)
00934 {
00935 const unsigned int NWords = m_words.size();
00936 const unsigned int NDocs = training_features.size();
00937
00938 if(m_weighting == TF || m_weighting == BINARY)
00939 {
00940
00941 for(unsigned int i = 0; i < NWords; i++)
00942 m_words[i]->weight = 1;
00943 }
00944 else if(m_weighting == IDF || m_weighting == TF_IDF)
00945 {
00946
00947
00948
00949
00950
00951 vector<unsigned int> Ni(NWords, 0);
00952 vector<bool> counted(NWords, false);
00953
00954 typename vector<vector<TDescriptor> >::const_iterator mit;
00955 typename vector<TDescriptor>::const_iterator fit;
00956
00957 for(mit = training_features.begin(); mit != training_features.end(); ++mit)
00958 {
00959 fill(counted.begin(), counted.end(), false);
00960
00961 for(fit = mit->begin(); fit < mit->end(); ++fit)
00962 {
00963 WordId word_id;
00964 transform(*fit, word_id);
00965
00966 if(!counted[word_id])
00967 {
00968 Ni[word_id]++;
00969 counted[word_id] = true;
00970 }
00971 }
00972 }
00973
00974
00975 for(unsigned int i = 0; i < NWords; i++)
00976 {
00977 if(Ni[i] > 0)
00978 {
00979 m_words[i]->weight = log((double)NDocs / (double)Ni[i]);
00980 }
00981 }
00982
00983 }
00984
00985 }
00986
00987
00988
00989 template<class TDescriptor, class F>
00990 inline unsigned int TemplatedVocabulary<TDescriptor,F>::size() const
00991 {
00992 return m_words.size();
00993 }
00994
00995
00996
00997 template<class TDescriptor, class F>
00998 inline bool TemplatedVocabulary<TDescriptor,F>::empty() const
00999 {
01000 return m_words.empty();
01001 }
01002
01003
01004
01005 template<class TDescriptor, class F>
01006 float TemplatedVocabulary<TDescriptor,F>::getEffectiveLevels() const
01007 {
01008 long sum = 0;
01009 typename std::vector<Node*>::const_iterator wit;
01010 for(wit = m_words.begin(); wit != m_words.end(); ++wit)
01011 {
01012 const Node *p = *wit;
01013
01014 for(; p->id != 0; sum++) p = &m_nodes[p->parent];
01015 }
01016
01017 return (float)((double)sum / (double)m_words.size());
01018 }
01019
01020
01021
01022 template<class TDescriptor, class F>
01023 TDescriptor TemplatedVocabulary<TDescriptor,F>::getWord(WordId wid) const
01024 {
01025 return m_words[wid]->descriptor;
01026 }
01027
01028
01029
01030 template<class TDescriptor, class F>
01031 WordValue TemplatedVocabulary<TDescriptor, F>::getWordWeight(WordId wid) const
01032 {
01033 return m_words[wid]->weight;
01034 }
01035
01036
01037
01038 template<class TDescriptor, class F>
01039 WordId TemplatedVocabulary<TDescriptor, F>::transform
01040 (const TDescriptor& feature) const
01041 {
01042 if(empty())
01043 {
01044 return 0;
01045 }
01046
01047 WordId wid;
01048 transform(feature, wid);
01049 return wid;
01050 }
01051
01052
01053
01054 template<class TDescriptor, class F>
01055 void TemplatedVocabulary<TDescriptor,F>::transform(
01056 const std::vector<TDescriptor>& features, BowVector &v) const
01057 {
01058 v.clear();
01059
01060 if(empty())
01061 {
01062 return;
01063 }
01064
01065
01066 LNorm norm;
01067 bool must = m_scoring_object->mustNormalize(norm);
01068
01069 typename vector<TDescriptor>::const_iterator fit;
01070
01071 if(m_weighting == TF || m_weighting == TF_IDF)
01072 {
01073 for(fit = features.begin(); fit < features.end(); ++fit)
01074 {
01075 WordId id;
01076 WordValue w;
01077
01078
01079 transform(*fit, id, w);
01080
01081
01082 if(w > 0) v.addWeight(id, w);
01083 }
01084
01085 if(!v.empty() && !must)
01086 {
01087
01088 const double nd = v.size();
01089 for(BowVector::iterator vit = v.begin(); vit != v.end(); vit++)
01090 vit->second /= nd;
01091 }
01092
01093 }
01094 else
01095 {
01096 for(fit = features.begin(); fit < features.end(); ++fit)
01097 {
01098 WordId id;
01099 WordValue w;
01100
01101
01102 transform(*fit, id, w);
01103
01104
01105 if(w > 0) v.addIfNotExist(id, w);
01106
01107 }
01108 }
01109
01110 if(must) v.normalize(norm);
01111 }
01112
01113
01114
01115 template<class TDescriptor, class F>
01116 void TemplatedVocabulary<TDescriptor,F>::transform(
01117 const std::vector<TDescriptor>& features,
01118 BowVector &v, FeatureVector &fv, int levelsup) const
01119 {
01120 v.clear();
01121 fv.clear();
01122
01123 if(empty())
01124 {
01125 return;
01126 }
01127
01128
01129 LNorm norm;
01130 bool must = m_scoring_object->mustNormalize(norm);
01131
01132 typename vector<TDescriptor>::const_iterator fit;
01133
01134 if(m_weighting == TF || m_weighting == TF_IDF)
01135 {
01136 unsigned int i_feature = 0;
01137 for(fit = features.begin(); fit < features.end(); ++fit, ++i_feature)
01138 {
01139 WordId id;
01140 NodeId nid;
01141 WordValue w;
01142
01143
01144 transform(*fit, id, w, &nid, levelsup);
01145
01146 if(w > 0)
01147 {
01148 v.addWeight(id, w);
01149 fv.addFeature(nid, i_feature);
01150 }
01151 }
01152
01153 if(!v.empty() && !must)
01154 {
01155
01156 const double nd = v.size();
01157 for(BowVector::iterator vit = v.begin(); vit != v.end(); vit++)
01158 vit->second /= nd;
01159 }
01160
01161 }
01162 else
01163 {
01164 unsigned int i_feature = 0;
01165 for(fit = features.begin(); fit < features.end(); ++fit, ++i_feature)
01166 {
01167 WordId id;
01168 NodeId nid;
01169 WordValue w;
01170
01171
01172 transform(*fit, id, w, &nid, levelsup);
01173
01174 if(w > 0)
01175 {
01176 v.addIfNotExist(id, w);
01177 fv.addFeature(nid, i_feature);
01178 }
01179 }
01180 }
01181
01182 if(must) v.normalize(norm);
01183 }
01184
01185
01186
01187 template<class TDescriptor, class F>
01188 inline double TemplatedVocabulary<TDescriptor,F>::score
01189 (const BowVector &v1, const BowVector &v2) const
01190 {
01191 return m_scoring_object->score(v1, v2);
01192 }
01193
01194
01195
01196 template<class TDescriptor, class F>
01197 void TemplatedVocabulary<TDescriptor,F>::transform
01198 (const TDescriptor &feature, WordId &id) const
01199 {
01200 WordValue weight;
01201 transform(feature, id, weight);
01202 }
01203
01204
01205
01206 template<class TDescriptor, class F>
01207 void TemplatedVocabulary<TDescriptor,F>::transform(const TDescriptor &feature,
01208 WordId &word_id, WordValue &weight, NodeId *nid, int levelsup) const
01209 {
01210
01211 vector<NodeId> nodes;
01212 typename vector<NodeId>::const_iterator nit;
01213
01214
01215 const int nid_level = m_L - levelsup;
01216 if(nid_level <= 0 && nid != NULL) *nid = 0;
01217
01218 NodeId final_id = 0;
01219 int current_level = 0;
01220
01221 do
01222 {
01223 ++current_level;
01224 nodes = m_nodes[final_id].children;
01225 final_id = nodes[0];
01226
01227 double best_d = F::distance(feature, m_nodes[final_id].descriptor);
01228
01229 for(nit = nodes.begin() + 1; nit != nodes.end(); ++nit)
01230 {
01231 NodeId id = *nit;
01232 double d = F::distance(feature, m_nodes[id].descriptor);
01233 if(d < best_d)
01234 {
01235 best_d = d;
01236 final_id = id;
01237 }
01238 }
01239
01240 if(nid != NULL && current_level == nid_level)
01241 *nid = final_id;
01242
01243 } while( !m_nodes[final_id].isLeaf() );
01244
01245
01246 word_id = m_nodes[final_id].word_id;
01247 weight = m_nodes[final_id].weight;
01248 }
01249
01250
01251
01252 template<class TDescriptor, class F>
01253 NodeId TemplatedVocabulary<TDescriptor,F>::getParentNode
01254 (WordId wid, int levelsup) const
01255 {
01256 NodeId ret = m_words[wid]->id;
01257 while(levelsup > 0 && ret != 0)
01258 {
01259 --levelsup;
01260 ret = m_nodes[ret].parent;
01261 }
01262 return ret;
01263 }
01264
01265
01266
01267 template<class TDescriptor, class F>
01268 void TemplatedVocabulary<TDescriptor,F>::getWordsFromNode
01269 (NodeId nid, std::vector<WordId> &words) const
01270 {
01271 words.clear();
01272
01273 if(m_nodes[nid].isLeaf())
01274 {
01275 words.push_back(m_nodes[nid].word_id);
01276 }
01277 else
01278 {
01279 words.reserve(m_k);
01280
01281 vector<NodeId> parents;
01282 parents.push_back(nid);
01283
01284 while(!parents.empty())
01285 {
01286 NodeId parentid = parents.back();
01287 parents.pop_back();
01288
01289 const vector<NodeId> &child_ids = m_nodes[parentid].children;
01290 vector<NodeId>::const_iterator cit;
01291
01292 for(cit = child_ids.begin(); cit != child_ids.end(); ++cit)
01293 {
01294 const Node &child_node = m_nodes[*cit];
01295
01296 if(child_node.isLeaf())
01297 words.push_back(child_node.word_id);
01298 else
01299 parents.push_back(*cit);
01300
01301 }
01302 }
01303 }
01304 }
01305
01306
01307
01308 template<class TDescriptor, class F>
01309 int TemplatedVocabulary<TDescriptor,F>::stopWords(double minWeight)
01310 {
01311 int c = 0;
01312 typename vector<Node*>::iterator wit;
01313 for(wit = m_words.begin(); wit != m_words.end(); ++wit)
01314 {
01315 if((*wit)->weight < minWeight)
01316 {
01317 ++c;
01318 (*wit)->weight = 0;
01319 }
01320 }
01321 return c;
01322 }
01323
01324
01325
01326 template<class TDescriptor, class F>
01327 void TemplatedVocabulary<TDescriptor,F>::save(const std::string &filename) const
01328 {
01329 cv::FileStorage fs(filename.c_str(), cv::FileStorage::WRITE);
01330 if(!fs.isOpened()) throw string("Could not open file ") + filename;
01331
01332 save(fs);
01333 }
01334
01335
01336
01337 template<class TDescriptor, class F>
01338 void TemplatedVocabulary<TDescriptor,F>::load(const std::string &filename)
01339 {
01340 cv::FileStorage fs(filename.c_str(), cv::FileStorage::READ);
01341 if(!fs.isOpened()) throw string("Could not open file ") + filename;
01342
01343 this->load(fs);
01344 }
01345
01346
01347
01348 template<class TDescriptor, class F>
01349 void TemplatedVocabulary<TDescriptor,F>::save(cv::FileStorage &f,
01350 const std::string &name) const
01351 {
01352
01353
01354
01355
01356
01357
01358
01359
01360
01361
01362
01363
01364
01365
01366
01367
01368
01369
01370
01371
01372
01373
01374
01375
01376
01377
01378
01379
01380 f << name << "{";
01381
01382 f << "k" << m_k;
01383 f << "L" << m_L;
01384 f << "scoringType" << m_scoring;
01385 f << "weightingType" << m_weighting;
01386
01387
01388 f << "nodes" << "[";
01389 vector<NodeId> parents, children;
01390 vector<NodeId>::const_iterator pit;
01391
01392 parents.push_back(0);
01393
01394 while(!parents.empty())
01395 {
01396 NodeId pid = parents.back();
01397 parents.pop_back();
01398
01399 const Node& parent = m_nodes[pid];
01400 children = parent.children;
01401
01402 for(pit = children.begin(); pit != children.end(); pit++)
01403 {
01404 const Node& child = m_nodes[*pit];
01405
01406
01407 f << "{:";
01408 f << "nodeId" << (int)child.id;
01409 f << "parentId" << (int)pid;
01410 f << "weight" << (double)child.weight;
01411 f << "descriptor" << F::toString(child.descriptor);
01412 f << "}";
01413
01414
01415 if(!child.isLeaf())
01416 {
01417 parents.push_back(*pit);
01418 }
01419 }
01420 }
01421
01422 f << "]";
01423
01424
01425 f << "words" << "[";
01426
01427 typename vector<Node*>::const_iterator wit;
01428 for(wit = m_words.begin(); wit != m_words.end(); wit++)
01429 {
01430 WordId id = wit - m_words.begin();
01431 f << "{:";
01432 f << "wordId" << (int)id;
01433 f << "nodeId" << (int)(*wit)->id;
01434 f << "}";
01435 }
01436
01437 f << "]";
01438
01439 f << "}";
01440
01441 }
01442
01443
01444
01445 template<class TDescriptor, class F>
01446 void TemplatedVocabulary<TDescriptor,F>::load(const cv::FileStorage &fs,
01447 const std::string &name)
01448 {
01449 m_words.clear();
01450 m_nodes.clear();
01451
01452 cv::FileNode fvoc = fs[name];
01453
01454 m_k = (int)fvoc["k"];
01455 m_L = (int)fvoc["L"];
01456 m_scoring = (ScoringType)((int)fvoc["scoringType"]);
01457 m_weighting = (WeightingType)((int)fvoc["weightingType"]);
01458
01459 createScoringObject();
01460
01461
01462 cv::FileNode fn = fvoc["nodes"];
01463
01464 m_nodes.resize(fn.size() + 1);
01465 m_nodes[0].id = 0;
01466
01467 for(unsigned int i = 0; i < fn.size(); ++i)
01468 {
01469 NodeId nid = (int)fn[i]["nodeId"];
01470 NodeId pid = (int)fn[i]["parentId"];
01471 WordValue weight = (WordValue)fn[i]["weight"];
01472 string d = (string)fn[i]["descriptor"];
01473
01474 m_nodes[nid].id = nid;
01475 m_nodes[nid].parent = pid;
01476 m_nodes[nid].weight = weight;
01477 m_nodes[pid].children.push_back(nid);
01478
01479 F::fromString(m_nodes[nid].descriptor, d);
01480 }
01481
01482
01483 fn = fvoc["words"];
01484
01485 m_words.resize(fn.size());
01486
01487 for(unsigned int i = 0; i < fn.size(); ++i)
01488 {
01489 NodeId wid = (int)fn[i]["wordId"];
01490 NodeId nid = (int)fn[i]["nodeId"];
01491
01492 m_nodes[nid].word_id = wid;
01493 m_words[wid] = &m_nodes[nid];
01494 }
01495 }
01496
01497
01498
01504 template<class TDescriptor, class F>
01505 std::ostream& operator<<(std::ostream &os,
01506 const TemplatedVocabulary<TDescriptor,F> &voc)
01507 {
01508 os << "Vocabulary: k = " << voc.getBranchingFactor()
01509 << ", L = " << voc.getDepthLevels()
01510 << ", Weighting = ";
01511
01512 switch(voc.getWeightingType())
01513 {
01514 case TF_IDF: os << "tf-idf"; break;
01515 case TF: os << "tf"; break;
01516 case IDF: os << "idf"; break;
01517 case BINARY: os << "binary"; break;
01518 }
01519
01520 os << ", Scoring = ";
01521 switch(voc.getScoringType())
01522 {
01523 case L1_NORM: os << "L1-norm"; break;
01524 case L2_NORM: os << "L2-norm"; break;
01525 case CHI_SQUARE: os << "Chi square distance"; break;
01526 case KL: os << "KL-divergence"; break;
01527 case BHATTACHARYYA: os << "Bhattacharyya coefficient"; break;
01528 case DOT_PRODUCT: os << "Dot product"; break;
01529 }
01530
01531 os << ", Number of words = " << voc.size();
01532
01533 return os;
01534 }
01535
01536 }
01537
01538 #endif