mlpack  2.2.5
dataset_mapper.hpp
Go to the documentation of this file.
1 
15 #ifndef MLPACK_CORE_DATA_DATASET_INFO_HPP
16 #define MLPACK_CORE_DATA_DATASET_INFO_HPP
17 
18 #include <mlpack/prereqs.hpp>
19 #include <unordered_map>
20 #include <boost/bimap.hpp>
21 
23 
24 namespace mlpack {
25 namespace data {
35 template <typename PolicyType>
37 {
38  public:
44  explicit DatasetMapper(const size_t dimensionality = 0);
45 
51  explicit DatasetMapper(PolicyType& policy, const size_t dimensionality = 0);
52 
60  template<typename T>
61  void MapFirstPass(const std::string& string, const size_t dimension);
62 
73  template<typename T>
74  T MapString(const std::string& string,
75  const size_t dimension);
76 
85  const std::string& UnmapString(const size_t value, const size_t dimension);
86 
87 
96  typename PolicyType::MappedType UnmapValue(const std::string& string,
97  const size_t dimension);
98 
111  template <typename eT>
112  void MapTokens(const std::vector<std::string>& tokens, size_t& row,
113  arma::Mat<eT>& matrix);
114 
116  Datatype Type(const size_t dimension) const;
118  Datatype& Type(const size_t dimension);
119 
124  size_t NumMappings(const size_t dimension) const;
125 
132  size_t Dimensionality() const;
133 
137  template<typename Archive>
138  void Serialize(Archive& ar, const unsigned int /* version */)
139  {
140  ar & data::CreateNVP(types, "types");
141  ar & data::CreateNVP(maps, "maps");
142  }
143 
145  const PolicyType& Policy() const;
146 
148  PolicyType& Policy();
150  void Policy(PolicyType&& policy);
151 
152  private:
154  std::vector<Datatype> types;
155 
156  // BiMapType definition
157  using BiMapType = boost::bimap<std::string, typename PolicyType::MappedType>;
158 
159  // Mappings from strings to integers.
160  // Map entries will only exist for dimensions that are categorical.
161  // MapType = map<dimension, pair<bimap<string, MappedType>, numMappings>>
162  using MapType = std::unordered_map<size_t, std::pair<BiMapType, size_t>>;
163 
165  MapType maps;
166 
168  // mapped to the maps object. It is used in MapString() and MapTokens().
169  PolicyType policy;
170 };
171 
172 // Use typedef to provide backward compatibility
174 
175 } // namespace data
176 } // namespace mlpack
177 
178 #include "dataset_mapper_impl.hpp"
179 
180 #endif
Auxiliary information for a dataset, including mappings to/from strings and the datatype of each dimension.
DatasetMapper(const size_t dimensionality=0)
Create the DatasetMapper object with the given dimensionality.
T MapString(const std::string &string, const size_t dimension)
Given the string and the dimension to which it belongs, return its numeric mapping.
Linear algebra utility functions, generally performed on matrices or vectors.
Definition: binarize.hpp:18
FirstShim< T > CreateNVP(T &t, const std::string &name, typename boost::enable_if< HasSerialize< T >>::type *=0)
Call this function to produce a name-value pair; this is similar to BOOST_SERIALIZATION_NVP(), but should be used for types that have a Serialize() function (or contain a type that has a Serialize() function) instead of a serialize() function.
void MapFirstPass(const std::string &string, const size_t dimension)
Preprocessing: during a first pass of the data, pass the strings on to the MapPolicy if they are needed.
The core includes that mlpack expects; standard C++ includes and Armadillo.
Datatype Type(const size_t dimension) const
Return the type of a given dimension (numeric or categorical).
size_t NumMappings(const size_t dimension) const
Get the number of mappings for a particular dimension.
const std::string & UnmapString(const size_t value, const size_t dimension)
Return the string that corresponds to a given value in a given dimension.
PolicyType::MappedType UnmapValue(const std::string &string, const size_t dimension)
Return the value that corresponds to a given string in a given dimension.
Datatype
The Datatype enum specifies the types of data mlpack algorithms can use.
Definition: datatype.hpp:24
size_t Dimensionality() const
Get the dimensionality of the DatasetMapper object (that is, how many dimensions it has information for).
const PolicyType & Policy() const
Return the policy of the mapper.
void Serialize(Archive &ar, const unsigned int)
Serialize the dataset information.
void MapTokens(const std::vector< std::string > &tokens, size_t &row, arma::Mat< eT > &matrix)
MapTokens turns vector of strings into numeric variables and puts them into a given matrix.