bes  Updated for version 3.20.10
reduce_mdf.cc
1 #include <iostream>
2 #include <sstream>
3 #include <fstream>
4 #include <string.h>
5 #include <stdio.h>
6 #include <vector>
7 #include <openssl/sha.h>
8 #include <unistd.h>
9 
10 using namespace std;
11 
12 bool obtain_offset_nbytes(const vector<string>& str_vec, vector<size_t>& offsets, vector<size_t>& nbytes);
13 
14 void string_tokenize(const string &in_str,const char delim,vector<string>&out_vec);
15 
16 size_t string_to_size_t(const string& str);
17 
18 bool retrieve_chunk_info(FILE*,vector<size_t> &offsets,vector<size_t> &nbytes);
19 
20 string retrieve_data_sha256(FILE*,const vector<size_t> &offsets,const vector<size_t> &nbytes);
21 
22 short write_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,const string & sha256_buf);
23 
24 short update_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,char*stored_fname,const string & sha256_buf);
25 
/**
 * @brief Render one byte as exactly two lowercase hexadecimal digits.
 *
 * A plain `stream << hex << (int) s` emits a single digit for values below
 * 0x10. Concatenating such output gives digests of varying length, and
 * different byte sequences can collapse to the same text (for example
 * {0x01,0x23} and {0x12,0x03} would both read "123"). Padding every byte to
 * two digits keeps the textual digest unambiguous and fixed-width.
 *
 * NOTE(review): digests stored by a build that did not pad will differ from
 * the ones produced here whenever a digest byte is below 0x10 - confirm
 * whether existing recording files need to be regenerated.
 *
 * @param s Byte to convert.
 * @return Two-character lowercase hex string, e.g. 0x0a -> "0a".
 */
std::string to_hex(unsigned char s) {
    static const char digits[] = "0123456789abcdef";

    std::string out(2, '0');
    out[0] = digits[(s >> 4) & 0x0F];   // high nibble
    out[1] = digits[s & 0x0F];          // low nibble
    return out;
}
31 
32 // If the return value is 0, the sha256 exists, no need to use the generated HDF5 file.
33 // If the return value is 1, the sha256 doesn't exist, need to use the generated HDF5 file.
34 int main(int argc,char **argv ) {
35 
36  if(argc !=5) {
37  cout<<"Please provide four arguments: "<< endl;
38  cout<<" The first is the dmrpp file that contains the missing variable value information. "<<endl;
39  cout<<" The second is the hdf5 file path that stores the missing variable values. "<<endl;
40  cout<<" The third is the text file that stores the file path and the sha256 value." <<endl;
41  cout<<" The fourth is the text file that stores the final HDF5 file path for this dmrpp file. "<<endl;
42  }
43 
44  // Retrieve the chunk info from the dmrpp file.
45  FILE* fp_dmrpp = fopen(argv[1],"r");
46  if(fp_dmrpp == NULL) {
47  cout<<"The dmrpp file doesn't exist"<<endl;
48  return -1;
49  }
50 
51  vector<size_t>offsets;
52  vector<size_t>nbytes;
53  bool ret_chunk = retrieve_chunk_info(fp_dmrpp,offsets,nbytes);
54  if(false == ret_chunk) {
55  cout<<"Cannot retrieve the chunk info from the dmrpp file successfully. "<<endl;
56  return -1;
57  }
58  fclose(fp_dmrpp);
59 
60  // Obtain the sha256.
61  FILE* fp_h5 = fopen(argv[2],"r");
62  if(fp_h5 == NULL) {
63  cout<<"The HDF5 file doesn't exist"<<endl;
64  return -1;
65  }
66 
67  string sha256_buf = retrieve_data_sha256(fp_h5,offsets,nbytes);
68  if(sha256_buf=="") {
69  cout<<"The sha256 of this file doesn't exist"<<endl;
70  return -1;
71  }
72  fclose(fp_h5);
73 
74  // Store the sha256 if necessary to a file.
75  short ret_value = update_sha256_file(argv[1],argv[2],argv[3],argv[4],sha256_buf);
76  //return ret_value;
77  return ret_value;
78 }
79 
// Append one record "<h5 file> <dmrpp file> <sha256>\n" to the recording file.
//
// m_dmrpp_fname  - dmrpp file name written as the second field.
// m_h5_fname     - HDF5 file name written as the first field.
// m_sha256_fname - recording file; opened in append mode, created if absent.
// sha256_buf     - sha256 text written as the third field.
//
// Returns 1 when the whole record was written, -1 when the recording file
// cannot be opened, the write is short, or closing (flushing) the file fails.
short write_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,const std::string & sha256_buf) {

    FILE* fp = fopen(m_sha256_fname,"a");
    if(fp == NULL)
        return -1;      // nothing to write to; fwrite/fclose on NULL would crash

    const std::string record =
        std::string(m_h5_fname) + ' ' + std::string(m_dmrpp_fname) + ' ' + sha256_buf + '\n';

    const size_t written = fwrite(record.data(),1,record.size(),fp);

    // Buffered output is only pushed to the file when it is closed, so a
    // failing fclose means the record may not have been stored.
    const bool closed_ok = (fclose(fp) == 0);

    short sha_fname_ret = 1;
    if(written != record.size() || !closed_ok)
        sha_fname_ret = -1;

    return sha_fname_ret;
}
97 
98 // Update the sha256 in the recording file if necessary.
99 short update_sha256_file(char* m_dmrpp_fname,char* m_h5_fname,char* m_sha256_fname,char* store_h5_fname,const string & sha256_buf) {
100 
101  // If the recording file that stores thesha256 doesn't exist,
102  // just create this file and write the sha256 etc information to the file.
103 
104  /* removed 11.10.20 SBL
105  * removed due to sonarcloud claiming that having the following before the if statement on ln 126
106  * causes a race condition and a security vulnerability
107  */
108  //if(access(m_sha256_fname,F_OK)==-1)
109  // return write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
110 
111  //
112  // If the recording file exists, open this file and see if the sha256 of
113  // this missing data can be found from the recording file.
114  // If the sha256 can be found,then the missing data file exists, we don't
115  // need to create a new one, otherwise, a new one needs to be created.
116  // If the sha256 can be found, we need to create a temp. text file to store
117  // the missing data file name so that this information can be passed to
118  // the patched dmrpp program afterwards.
119  short ret_value = 1;
120  ifstream sha_fstream;
121  sha_fstream.open(m_sha256_fname,ifstream::in);
122  /* added 11.10.20 SBL
123  * added check to fix race condition vulnerability detected by SonarCloud
124  * checks if stream was opened correctly and if not creates the file and opens again
125  */
126  if (!sha_fstream.is_open()){
127  write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
128  sha_fstream.open(m_sha256_fname,ifstream::in);
129  }
130  string sha_line;
131  char space_char=' ';
132  //char end_line='\n';
133  bool space_fname_ret = true;
134  bool need_add_sha256 = true;
135 
136  while(getline(sha_fstream,sha_line)) {
137 
138  size_t fname_epos = sha_line.find(space_char);
139  if(fname_epos==string::npos) {
140  space_fname_ret = false;
141  break;
142  }
143 
144  size_t dname_epos = sha_line.find(space_char,fname_epos+1);
145  if(dname_epos==string::npos) {
146  space_fname_ret = false;
147  break;
148  }
149 
150  string f_sha256_buf = sha_line.substr(dname_epos+1);
151  if(f_sha256_buf == sha256_buf) {
152 
153  need_add_sha256 = false;
154 
155  string exist_m_h5_name = sha_line.substr(0,fname_epos);
156  string exist_m_dmrpp_name = sha_line.substr(fname_epos+1,dname_epos-fname_epos-1);
157 
158  // Open the file to store the HDF5 and dmrpp file
159  FILE*fp = fopen(store_h5_fname,"a");
160  string file_content = exist_m_h5_name +' '+exist_m_dmrpp_name;
161  vector<char>buf(file_content.begin(),file_content.end());
162  size_t fsize = fwrite(&buf[0],1,file_content.size(),fp);
163  if(fsize != file_content.size())
164  ret_value = -1;
165  fclose(fp);
166  break;
167  }
168  }
169  sha_fstream.close();
170 
171 
172  if(false == space_fname_ret)
173  ret_value = -1;
174  if(false == need_add_sha256)
175  ret_value = 0;
176 
177  // sha256 is not found, append this sha256 and the missing data file name to the recording file.
178  if(true == space_fname_ret) {
179  if(true == need_add_sha256) {
180  ret_value = write_sha256_file(m_dmrpp_fname,m_h5_fname,m_sha256_fname,sha256_buf);
181  }
182  }
183 
184  return ret_value;
185 }
186 
187 // Obtain the sha256 from the data values.
188 string retrieve_data_sha256(FILE*fp,const vector<size_t> &offsets,const vector<size_t> &nbytes){
189 
190  string ret_str;
191  size_t fSize = 0;
192  unsigned char hash[SHA256_DIGEST_LENGTH];
193 
194  // This is the buffer size
195  for(size_t i = 0; i <nbytes.size();i++)
196  fSize+=nbytes[i];
197 
198  // Read in the offset and byte information.
199  vector<char>buf;
200  buf.resize(fSize);
201 
202  size_t cur_size = 0;
203  for(size_t i = 0; i<offsets.size();i++) {
204  // Seek according to offset
205  if(fseek(fp,offsets[i],SEEK_SET)!=0)
206  return ret_str;
207  /* unused size_t result =*/ fread(&buf[cur_size],1,nbytes[i],fp);
208  cur_size +=nbytes[i];
209  }
210 
211  // Calculate the hash
212  SHA256((const unsigned char*)&buf[0],fSize,hash);
213 
214  string output="";
215 
216  // Change 256 to hex and to a string
217  for(int i =0; i<SHA256_DIGEST_LENGTH;i++)
218  output+=to_hex(hash[i]);
219 
220  return output;
221 }
222 
223 // Retrieve the offsets and number of bytes of variable values.
224 bool retrieve_chunk_info(FILE*fp,vector<size_t> &offsets,vector<size_t> &nbytes) {
225 
226  size_t fSize = 0;
227 
228  // Read in the offset and byte information.
229  if(fseek(fp,0,SEEK_END)!=0)
230  return false;
231  fSize = ftell(fp);
232 #if 0
233  // fSize is unsigned. jhrg 11/23/21
234  if(fSize <0)
235  return false;
236 #endif
237 
238  if(fseek(fp,0,SEEK_SET)!=0)
239  return false;
240 
241  vector<char>buf;
242  buf.resize((size_t)fSize);
243  size_t result = fread(&buf[0],1,fSize,fp);
244  if(result != fSize)
245  return false;
246 
247  string str(buf.begin(),buf.end());
248  char delim='\n';
249  vector<string> str_vec;
250  string_tokenize(str,delim,str_vec);
251 
252  bool get_offset_nbytes = obtain_offset_nbytes(str_vec,offsets,nbytes);
253  if(false == get_offset_nbytes) {
254  cout<<"cannot successfully obtain the offset and nbytes. \n";
255  return false;
256  }
257 
258 #if 0
259  for (int i = 0; i <offsets.size();i++) {
260  cout<<"offset["<<i<<"]= " <<offsets[i] <<endl;
261  cout<<"nbyte["<<i<<"]= " <<nbytes[i] <<endl;
262  }
263 #endif
264 
265  return get_offset_nbytes;
266 
267 }
268 
// Extract the leading unsigned number of `text` into `value`.
// Returns false (and leaves `value` at 0) when no number can be extracted.
static bool parse_chunk_number(const std::string& text, size_t& value) {
    std::stringstream field(text);
    value = 0;
    field >> value;
    return !field.fail();
}

// Obtain the offset and number of bytes from the dmrpp file.
// Here we don't need to worry about the filters. We just want to
// make sure the data values(either in compressed form or uncompressed form)
// can be retrieved.
//
// str_vec - lines of the dmrpp file.
// offsets - receives the offset of every chunk with a non-zero size.
// nbytes  - receives the size of every chunk with a non-zero size.
//
// Only lines containing `chunk offset="` are examined. Returns false, leaving
// offsets and nbytes untouched, when such a line has an unterminated offset
// value, no `nBytes="` attribute after the offset, an unterminated nBytes
// value, or a value that is not a number.
bool obtain_offset_nbytes(const std::vector<std::string>& str_vec, std::vector<size_t>& offsets, std::vector<size_t>& nbytes){

    const std::string offset_key = "chunk offset=\"";
    const std::string nbytes_key = "nBytes=\"";
    const char quote = '"';

    std::vector<size_t> unfiltered_offsets;
    std::vector<size_t> unfiltered_nbytes;

    for(size_t i = 0; i < str_vec.size(); i++) {

        const std::string& line = str_vec[i];

        // Skip the lines that carry no chunk information.
        const size_t offset_key_pos = line.find(offset_key);
        if(offset_key_pos == std::string::npos)
            continue;

        // The offset value sits between the key and the closing quote.
        const size_t offset_begin = offset_key_pos + offset_key.size();
        const size_t offset_end = line.find(quote,offset_begin);
        if(offset_end == std::string::npos)
            return false;

        // The nBytes attribute is searched after the offset value. Its
        // absence has to be caught here: adding the key length to npos would
        // wrap around and point at unrelated text.
        const size_t nbytes_key_pos = line.find(nbytes_key,offset_end);
        if(nbytes_key_pos == std::string::npos)
            return false;

        const size_t nbytes_begin = nbytes_key_pos + nbytes_key.size();
        const size_t nbytes_end = line.find(quote,nbytes_begin);
        if(nbytes_end == std::string::npos)
            return false;

        size_t offset_value = 0;
        size_t nbytes_value = 0;
        if(!parse_chunk_number(line.substr(offset_begin,offset_end-offset_begin),offset_value))
            return false;
        if(!parse_chunk_number(line.substr(nbytes_begin,nbytes_end-nbytes_begin),nbytes_value))
            return false;

        unfiltered_offsets.push_back(offset_value);
        unfiltered_nbytes.push_back(nbytes_value);
    }

    // Remove nbyte = 0 case. This is a bug caused by build_dmrpp. Before that is fixed, we
    // remove this case since this fortunately doesn't affect our purpose and the patch_dmrpp program.
    for(size_t i = 0; i < unfiltered_nbytes.size(); i++) {
        if(unfiltered_nbytes[i] != 0) {
            offsets.push_back(unfiltered_offsets[i]);
            nbytes.push_back(unfiltered_nbytes[i]);
        }
    }

    return true;
}
324 
// Split in_str at every occurrence of delim and append the pieces to out_vec.
// Pieces between two adjacent delimiters are kept as empty strings; a
// delimiter at the very end of in_str does not produce a trailing empty piece.
// out_vec is appended to, never cleared.
void string_tokenize(const std::string &in_str,const char delim,std::vector<std::string>&out_vec) {
    std::string::size_type piece_begin = 0;

    while (piece_begin < in_str.size()) {
        const std::string::size_type piece_end = in_str.find(delim, piece_begin);

        if (piece_end == std::string::npos) {
            // Last piece: everything that is left.
            out_vec.push_back(in_str.substr(piece_begin));
            return;
        }

        out_vec.push_back(in_str.substr(piece_begin, piece_end - piece_begin));
        piece_begin = piece_end + 1;
    }
}
333 
// Convert string to size_t.
//
// str - text starting with an unsigned decimal number.
//
// Returns the extracted number. The result starts out as 0 because the
// extraction leaves its target untouched when the text is empty or holds only
// white space; without the initial value an indeterminate number would be
// returned in that case.
size_t string_to_size_t(const std::string& str) {
    std::stringstream sstream(str);
    size_t str_num = 0;
    sstream >> str_num;
    return str_num;
}
341 
342