c++ - Merge N log files maintaining the chronological order -


i have n different log files coming n different services running on our device. want merge n files single file maintaining chronological order. file size can vary few kb gb.

the n log files have same format, , like:

**********  logging session started ************ * hmsoa version: 2.4.0.12 * exe path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a_driver.exe * exe version: 1.6.0.154 ************************************************   time = 2017/02/01 11:12:12,180 ; thid = 4924; cat = ; lvl = 1000; log = api 'connect'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'connect'->exit=0; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'ccisproxylocal connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'crecoveryaxesproxylocal connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'camplifierproxylocalv3 connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'system_diagnosis_get'->enter; time = 2017/02/01 11:12:12,211 ; thid = 4924; cat = ; lvl = 1000; log = api 'system_diagnosis_get'->exit=0; time = 2017/02/01 11:12:12,211 ; thid = 4924; cat = ; lvl = 1000; log = api 'lbl_square_set'->enter; time = 2017/02/01 11:12:12,219 ; thid = 4924; cat = ; lvl = 1000; log = api 'lbl_square_set'->exit=0; 

since have n different files, did far apply external sorting algorithm reading single line each file:

#include "stdafx.h" #include "boost/regex.hpp" #include "boost/lexical_cast.hpp" #include "boost\filesystem.hpp" #include <string> #include <fstream> #include <iostream> #include <algorithm> #include <sstream> #include <climits> #include <ctime> namespace fs = boost::filesystem;  static const boost::regex expression(r"(^(?:(?:time\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); static const boost::regex namefileex(r"(^[\d\-\_]+(\w+\s?\w+|\w+))"); static const std::string path("e:\\2017-02-01");  //static const std::string path("e:\\testlog");  unsigned long time2milleseconds(const std::string & time) {     int a, b, c, d;     if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3)         return * 3600000 + b * 60000 + c * 1000 + d; }  void readallfilesuntilline7(std::vector<std::pair<std::ifstream, std::string>> & vifs) {     std::string line;     (int = 0; < vifs.size(); ++i)     {         int linenumber = 0;         while (linenumber != 7 && std::getline(vifs[i].first, line))         {              ++linenumber;         }     } }  void checkregex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logtime, std::vector<std::string> & lines, int index, int & counter) {     std::string line;     boost::smatch what;     if (std::getline(vifs[index].first, line))     {         if (boost::regex_search(line, what, expression))         {             logtime[index] = time2milleseconds(what[1]);         }         lines[index] = line;     }     else     {         --counter;         logtime[index] = ulong_max;     } }  void mergefiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logtime, std::vector<std::string> & lines, std::ofstream & file, int & counter) {     std::string line;     boost::smatch what;     int index = 0;     (int = 0; < vifs.size(); ++i)     {         checkregex(vifs, logtime, lines, i, counter);     }     index = min_element(logtime.begin(), logtime.end()) - logtime.begin();     file << lines[index] << " --> " << vifs[index].second << "\n";     while (true)     {         checkregex(vifs, logtime, lines, index, counter);         index = min_element(logtime.begin(), logtime.end()) - logtime.begin();         if (0 == counter)             break;         file << lines[index] << " --> " << vifs[index].second << "\n";     } }  int main() {     clock_t begin = clock();     int cnt = std::count_if(fs::directory_iterator(path),fs::directory_iterator(),static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));     std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);     int index = 0;     boost::smatch what;     std::string file;     (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)     {         if (fs::is_regular_file(d->path()))         {             file = d->path().filename().string();             if (boost::regex_search(file, what, namefileex))             {                 vifs[index++] = std::make_pair(std::ifstream(d->path().string()), what[1]);             }         }     }     std::vector<unsigned long> logtime(cnt, ulong_max);     std::vector<std::string> lines(cnt);     std::ofstream filename(path + "\\testlog.txt");     readallfilesuntilline7(vifs);     mergefiles(vifs, logtime, lines, filename, cnt);     filename.close();     clock_t end = clock();     double elapsed_secs = double(end - begin) / clocks_per_sec;     std::cout << "elapsed time = " << elapsed_secs << "\n";     return 0; } 

it supposed slow. merge 82 files sizes ranging 1 kb 250 mb , create final file more 6000000 lines takes 70 minutes.

how can speed algorithm? appreciated!

update

i have implemented version heap, too:

data.h:

#pragma once  #include <string>  class data { public:     data(dword index,          const std::string & line,          ulong time);     ~data();     inline const ulong gettime() const  {return time; }     inline const dword getindex() const { return index; }     inline const std::string getline() const { return line; } private:     dword index;     std::string line;     ulong time; };  class compare { public:     bool operator()(const data & lhs, const data & rhs) { return lhs.gettime() > rhs.gettime(); }; }; 

data.cpp:

#include "stdafx.h" #include "data.h"   data::data(dword i_index,            const std::string & i_line,            ulong i_time)     : index(i_index)     , line(i_line)     , time(i_time) { }   data::~data() { } 

main.cpp:

#include "stdafx.h" #include "boost/regex.hpp" #include "boost/lexical_cast.hpp" #include "boost\filesystem.hpp" #include <string> #include <fstream> #include <iostream> #include <algorithm> #include <sstream> #include <climits> #include <ctime> #include <queue> #include "data.h" namespace fs = boost::filesystem;  static const boost::regex expression(r"(^(?:(?:time\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); static const boost::regex namefileex(r"(^[\d\-\_]+(\w+\s?\w+|\w+))"); static const std::string path("e:\\2017-02-01"); //static const std::string path("e:\\testlog");  unsigned long time2milleseconds(const std::string & time) {     int a, b, c, d;     if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3)         return * 3600000 + b * 60000 + c * 1000 + d; }  void initializeheap(std::ifstream & ifs, std::priority_queue<data, std::vector<data>, compare> & myheap, const int index) {     ulong time;     std::string line;     boost::smatch what;     bool match = false;     while (!match && std::getline(ifs, line))     {         if (boost::regex_search(line, what, expression))         {             time = time2milleseconds(what[1]);             myheap.push(data(index, line, time));             match = true;         }     } }  void checkregex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<data, std::vector<data>, compare> & myheap, ulong time, const int index) {     std::string line;     boost::smatch what;     if (std::getline(vifs[index].first, line))     {         if (boost::regex_search(line, what, expression))         {             time = time2milleseconds(what[1]);         }         myheap.push(data(index, line, time));     } }  void mergefiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<data, std::vector<data>, compare> & myheap, std::ofstream & file) {     int index = 0;     ulong time = 0;     while (!myheap.empty())     {         index = myheap.top().getindex();         time = myheap.top().gettime();         file << myheap.top().getline() << " --> " << vifs[index].second << "\n";         myheap.pop();         checkregex(vifs, myheap, time, index);     } }  int main() {     clock_t begin = clock();     int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file));     std::priority_queue<data, std::vector<data>, compare> myheap;     std::vector<std::pair<std::ifstream, std::string>> vifs(cnt);     int index = 0;     boost::smatch what;     std::string file;     (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d)     {         if (fs::is_regular_file(d->path()))         {             file = d->path().filename().string();             if (boost::regex_search(file, what, namefileex))             {                 vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]);                 initializeheap(vifs[index].first, myheap, index);                 ++index;             }         }     }     std::ofstream filename(path + "\\testlog.txt");     mergefiles(vifs, myheap, filename);     filename.close();     clock_t end = clock();     double elapsed_secs = double(end - begin) / clocks_per_sec;     std::cout << "elapsed time = " << elapsed_secs << "\n";     return 0; } 

after work, realized yesterday run program in debug. launching both implementations in release, got following results:

  • vector implementation: 25 seconds
  • heap implementation: 27 seconds

thus, or implementation heap structure not optimized or 2 implementation equal in running time.

is there else can speed execution?

this can done faster , low memory. consider first:

  • read single line each file (so n lines in memory @ 1 time).
  • find smallest of n lines, output it.
  • in memory, replace value output next line file current output came (take care of eof case).

if m length of output file (ie length of logs combined), trivial implementation o(n * m).

however, above can improved using heap, reduces time o(m log n). is, put n in-memory elements on heap. pop off top 1 output smallest element. then, when read new line, throw line on heap.


Comments