13 #ifndef MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
14 #define MLPACK_CORE_DATA_STRING_ENCODING_POLICIES_TF_IDF_ENCODING_POLICY_HPP
76 const bool smoothIdf =
true) :
// Discard the statistics gathered so far: the per-line token counts and the
// per-token count of lines that contain the token.
86 tokensFrequences.clear();
87 numContainingStrings.clear();
/**
 * Prepare a matrix-typed output for encoding by resetting it through
 * output.zeros(dictionarySize, datasetSize).
 *
 * NOTE(review): presumably an Armadillo-style zeros(rows, cols), giving one
 * row per dictionary entry and one column per dataset line — confirm against
 * MatType.
 *
 * @tparam MatType Type of the output matrix; it must provide a two-argument
 *     zeros() member.
 * @param datasetSize Passed as the second extent of the matrix.
 * @param dictionarySize Passed as the first extent of the matrix.
 */
103 template<
typename MatType>
105 const size_t datasetSize,
107 const size_t dictionarySize)
109 output.zeros(dictionarySize, datasetSize);
/**
 * Prepare a nested-vector output for encoding: output is resized to
 * datasetSize rows, and every row appended by that resize is a vector of
 * dictionarySize value-initialized elements.
 *
 * NOTE(review): std::vector::resize only fills the rows it appends; rows that
 * are already present keep their current length and contents — confirm that
 * callers pass an empty container.
 *
 * @tparam ElemType Element type of the inner vectors.
 * @param output Container that receives the rows.
 * @param datasetSize Number of rows after the call.
 * @param dictionarySize Length of every newly created row.
 */
126 template<
typename ElemType>
127 static void InitMatrix(std::vector<std::vector<ElemType>>& output,
128 const size_t datasetSize,
130 const size_t dictionarySize)
132 output.resize(datasetSize, std::vector<ElemType>(dictionarySize));
/**
 * Store the tf-idf weight of one token of one line in a matrix-typed output.
 * The weight is the product of TermFrequency() and
 * InverseDocumentFrequency(), both evaluated in MatType::elem_type.
 *
 * Both count lookups use operator[] on the unordered_map members, which
 * inserts a zero entry when the key is absent.
 *
 * @tparam MatType Type of the output matrix; the visible lines use its
 *     elem_type, n_cols and two-index operator().
 */
147 template<
typename MatType>
// Term frequency: the count of this token within the line, together with the
// recorded size of the line.
153 const typename MatType::elem_type tf =
154 TermFrequency<typename MatType::elem_type>(
155 tokensFrequences[line][value], linesSizes[line]);
// Inverse document frequency: the column count of the output is passed as the
// total number of lines.
157 const typename MatType::elem_type idf =
158 InverseDocumentFrequency<typename MatType::elem_type>(
159 output.n_cols, numContainingStrings[value]);
// The row index is value - 1 (presumably because token labels start at 1 —
// TODO confirm); the column index is the line.
161 output(value - 1, line) = tf * idf;
/**
 * Store the tf-idf weight of one token of one line in a nested-vector
 * output. The weight is the product of TermFrequency() and
 * InverseDocumentFrequency(), both evaluated in ElemType.
 *
 * Both count lookups use operator[] on the unordered_map members, which
 * inserts a zero entry when the key is absent.
 *
 * @tparam ElemType Element type of the inner vectors.
 * @param output Container indexed as output[line][value - 1].
 */
179 template<
typename ElemType>
180 void Encode(std::vector<std::vector<ElemType>>& output,
// Term frequency: the count of this token within the line, together with the
// recorded size of the line.
185 const ElemType tf = TermFrequency<ElemType>(
186 tokensFrequences[line][value], linesSizes[line]);
// Inverse document frequency: the number of rows in the output is passed as
// the total number of lines.
188 const ElemType idf = InverseDocumentFrequency<ElemType>(
189 output.size(), numContainingStrings[value]);
// The inner index is value - 1 (presumably because token labels start at 1 —
// TODO confirm); the outer index is the line.
191 output[line][value - 1] = tf * idf;
// Update the counting tables for one occurrence of token `value` in `line`.
// When the line index lies past the end of the per-line table, both per-line
// containers are grown so that index `line` becomes valid.
206 if (line >= tokensFrequences.size())
208 linesSizes.resize(line + 1);
209 tokensFrequences.resize(line + 1);
// operator[] creates a zero count the first time the token is seen in this
// line, so the increment below yields 1 for a first occurrence.
212 tokensFrequences[line][value]++;
// A count of exactly 1 means the token has just appeared in this line for the
// first time, so the number of lines containing it goes up by one.
214 if (tokensFrequences[line][value] == 1)
215 numContainingStrings[value]++;
// Accessor returning the per-line token counts by const reference.
221 const std::vector<std::unordered_map<size_t, size_t>>&
226 return tokensFrequences;
// Two accessors return numContainingStrings. NOTE(review): presumably a const
// and a modifiable variant; their signatures are not visible here — confirm.
232 return numContainingStrings;
238 return numContainingStrings;
// Read-only access to linesSizes, the vector indexed by line whose entries
// are passed to TermFrequency() as numTokens.
242 const std::vector<size_t>&
LinesSizes()
const {
return linesSizes; }
// Pass the two configuration members, tfType and smoothIdf, to the archive as
// name-value pairs (BOOST_SERIALIZATION_NVP uses the variable name as the
// tag). NOTE(review): presumably `ar &` both saves and loads depending on the
// archive type, as with Boost.Serialization archives — confirm.
259 template<
typename Archive>
262 ar & BOOST_SERIALIZATION_NVP(tfType);
263 ar & BOOST_SERIALIZATION_NVP(smoothIdf);
/**
 * Compute the term-frequency factor of a tf-idf weight. Four alternative
 * results are visible, followed by a fatal-log fallback:
 *  - numOccurrences > 0, an indicator that converts to 0 or 1;
 *  - numOccurrences itself;
 *  - numOccurrences / numTokens, divided in ValueType;
 *  - log(numOccurrences) + 1.
 * NOTE(review): the alternative is presumably selected by tfType; the
 * dispatch is not visible here — confirm.
 *
 * @tparam ValueType Type of the returned factor.
 * @param numOccurrences Count of the token; the Encode() call sites pass
 *     tokensFrequences[line][value].
 * @param numTokens Size of the line; the Encode() call sites pass
 *     linesSizes[line].
 */
276 template<
typename ValueType>
277 ValueType TermFrequency(
const size_t numOccurrences,
278 const size_t numTokens)
283 return numOccurrences > 0;
285 return numOccurrences;
// Cast before dividing so that the quotient is formed in ValueType rather
// than by integer division. NOTE(review): numTokens == 0 is a division by
// zero here — confirm callers never pass an empty line.
287 return static_cast<ValueType
>(numOccurrences) / numTokens;
// NOTE(review): for a floating-point ValueType, std::log(0) is negative
// infinity, so this form expects numOccurrences >= 1 — confirm.
289 return std::log(
static_cast<ValueType
>(numOccurrences)) + 1;
// Fallback when none of the results above was returned.
291 Log::Fatal <<
"Incorrect term frequency type!";
/**
 * Compute the inverse-document-frequency factor of a tf-idf weight. Two
 * alternative results are visible:
 *  - log((totalNumLines + 1) / (1 + numOccurrences)) + 1
 *  - log(totalNumLines / numOccurrences) + 1
 * NOTE(review): the first form is presumably used when smoothIdf is set; the
 * branch condition is not visible here — confirm.
 *
 * @tparam ValueType Type of the returned factor; the numerator is cast to it
 *     before the division.
 * @param totalNumLines Total number of lines; the Encode() call sites pass
 *     the output's column count or its size().
 * @param numOccurrences Number of lines that contain the token; the Encode()
 *     call sites pass numContainingStrings[value].
 */
305 template<
typename ValueType>
306 ValueType InverseDocumentFrequency(
const size_t totalNumLines,
307 const size_t numOccurrences)
// Adding one to both the numerator and the denominator keeps the denominator
// non-zero even when numOccurrences is 0.
311 return std::log(
static_cast<ValueType
>(totalNumLines + 1) /
312 (1 + numOccurrences)) + 1.0;
// NOTE(review): numOccurrences == 0 is a division by zero in this form —
// confirm callers only pass tokens that occur in at least one line.
316 return std::log(
static_cast<ValueType
>(totalNumLines) /
317 numOccurrences) + 1.0;
// Indexed by line: maps a token value to the number of times that token has
// been counted in the line.
323 std::vector<std::unordered_map<size_t, size_t>> tokensFrequences;
// Maps a token value to the number of lines in which it has been counted at
// least once.
328 std::unordered_map<size_t, size_t> numContainingStrings;
// Indexed by line: the value passed to TermFrequency() as numTokens.
330 std::vector<size_t> linesSizes;
343 template<
typename TokenType>