i have n different log files coming n different services running on our device. want merge n files single file maintaining chronological order. file size can vary few kb gb.
the n log files have same format, , like:
********** logging session started ************ * hmsoa version: 2.4.0.12 * exe path: c:\program files (x86)\silicon biosystems\deparray300a_driver\deparray300a_driver.exe * exe version: 1.6.0.154 ************************************************ time = 2017/02/01 11:12:12,180 ; thid = 4924; cat = ; lvl = 1000; log = api 'connect'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'connect'->exit=0; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'ccisproxylocal connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'crecoveryaxesproxylocal connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'camplifierproxylocalv3 connect - ok'->enter; time = 2017/02/01 11:12:12,196 ; thid = 4924; cat = ; lvl = 1000; log = api 'system_diagnosis_get'->enter; time = 2017/02/01 11:12:12,211 ; thid = 4924; cat = ; lvl = 1000; log = api 'system_diagnosis_get'->exit=0; time = 2017/02/01 11:12:12,211 ; thid = 4924; cat = ; lvl = 1000; log = api 'lbl_square_set'->enter; time = 2017/02/01 11:12:12,219 ; thid = 4924; cat = ; lvl = 1000; log = api 'lbl_square_set'->exit=0;
since have n different files, did far apply external sorting algorithm reading single line each file:
#include "stdafx.h" #include "boost/regex.hpp" #include "boost/lexical_cast.hpp" #include "boost\filesystem.hpp" #include <string> #include <fstream> #include <iostream> #include <algorithm> #include <sstream> #include <climits> #include <ctime> namespace fs = boost::filesystem; static const boost::regex expression(r"(^(?:(?:time\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); static const boost::regex namefileex(r"(^[\d\-\_]+(\w+\s?\w+|\w+))"); static const std::string path("e:\\2017-02-01"); //static const std::string path("e:\\testlog"); unsigned long time2milleseconds(const std::string & time) { int a, b, c, d; if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3) return * 3600000 + b * 60000 + c * 1000 + d; } void readallfilesuntilline7(std::vector<std::pair<std::ifstream, std::string>> & vifs) { std::string line; (int = 0; < vifs.size(); ++i) { int linenumber = 0; while (linenumber != 7 && std::getline(vifs[i].first, line)) { ++linenumber; } } } void checkregex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logtime, std::vector<std::string> & lines, int index, int & counter) { std::string line; boost::smatch what; if (std::getline(vifs[index].first, line)) { if (boost::regex_search(line, what, expression)) { logtime[index] = time2milleseconds(what[1]); } lines[index] = line; } else { --counter; logtime[index] = ulong_max; } } void mergefiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::vector<unsigned long> & logtime, std::vector<std::string> & lines, std::ofstream & file, int & counter) { std::string line; boost::smatch what; int index = 0; (int = 0; < vifs.size(); ++i) { checkregex(vifs, logtime, lines, i, counter); } index = min_element(logtime.begin(), logtime.end()) - logtime.begin(); file << lines[index] << " --> " << vifs[index].second << "\n"; while (true) { checkregex(vifs, logtime, lines, index, counter); index = min_element(logtime.begin(), logtime.end()) - logtime.begin(); if (0 == counter) break; file << lines[index] << " --> " << vifs[index].second << "\n"; } } int main() { clock_t begin = clock(); int cnt = std::count_if(fs::directory_iterator(path),fs::directory_iterator(),static_cast<bool(*)(const fs::path&)>(fs::is_regular_file)); std::vector<std::pair<std::ifstream, std::string>> vifs(cnt); int index = 0; boost::smatch what; std::string file; (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d) { if (fs::is_regular_file(d->path())) { file = d->path().filename().string(); if (boost::regex_search(file, what, namefileex)) { vifs[index++] = std::make_pair(std::ifstream(d->path().string()), what[1]); } } } std::vector<unsigned long> logtime(cnt, ulong_max); std::vector<std::string> lines(cnt); std::ofstream filename(path + "\\testlog.txt"); readallfilesuntilline7(vifs); mergefiles(vifs, logtime, lines, filename, cnt); filename.close(); clock_t end = clock(); double elapsed_secs = double(end - begin) / clocks_per_sec; std::cout << "elapsed time = " << elapsed_secs << "\n"; return 0; }
it supposed slow. merge 82 files sizes ranging 1 kb 250 mb , create final file more 6000000 lines takes 70 minutes.
how can speed algorithm? appreciated!
update
i have implemented version heap, too:
data.h:
#pragma once #include <string> class data { public: data(dword index, const std::string & line, ulong time); ~data(); inline const ulong gettime() const {return time; } inline const dword getindex() const { return index; } inline const std::string getline() const { return line; } private: dword index; std::string line; ulong time; }; class compare { public: bool operator()(const data & lhs, const data & rhs) { return lhs.gettime() > rhs.gettime(); }; };
data.cpp:
#include "stdafx.h" #include "data.h" data::data(dword i_index, const std::string & i_line, ulong i_time) : index(i_index) , line(i_line) , time(i_time) { } data::~data() { }
main.cpp:
#include "stdafx.h" #include "boost/regex.hpp" #include "boost/lexical_cast.hpp" #include "boost\filesystem.hpp" #include <string> #include <fstream> #include <iostream> #include <algorithm> #include <sstream> #include <climits> #include <ctime> #include <queue> #include "data.h" namespace fs = boost::filesystem; static const boost::regex expression(r"(^(?:(?:time\s=\s\d{4}\/\d{2}\/\d{2}\s)|(?:@))([0-9:.,]+))"); static const boost::regex namefileex(r"(^[\d\-\_]+(\w+\s?\w+|\w+))"); static const std::string path("e:\\2017-02-01"); //static const std::string path("e:\\testlog"); unsigned long time2milleseconds(const std::string & time) { int a, b, c, d; if (sscanf_s(time.c_str(), "%d:%d:%d,%d", &a, &b, &c, &d) >= 3) return * 3600000 + b * 60000 + c * 1000 + d; } void initializeheap(std::ifstream & ifs, std::priority_queue<data, std::vector<data>, compare> & myheap, const int index) { ulong time; std::string line; boost::smatch what; bool match = false; while (!match && std::getline(ifs, line)) { if (boost::regex_search(line, what, expression)) { time = time2milleseconds(what[1]); myheap.push(data(index, line, time)); match = true; } } } void checkregex(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<data, std::vector<data>, compare> & myheap, ulong time, const int index) { std::string line; boost::smatch what; if (std::getline(vifs[index].first, line)) { if (boost::regex_search(line, what, expression)) { time = time2milleseconds(what[1]); } myheap.push(data(index, line, time)); } } void mergefiles(std::vector<std::pair<std::ifstream, std::string>> & vifs, std::priority_queue<data, std::vector<data>, compare> & myheap, std::ofstream & file) { int index = 0; ulong time = 0; while (!myheap.empty()) { index = myheap.top().getindex(); time = myheap.top().gettime(); file << myheap.top().getline() << " --> " << vifs[index].second << "\n"; myheap.pop(); checkregex(vifs, myheap, time, index); } } int main() { clock_t begin = clock(); int cnt = std::count_if(fs::directory_iterator(path), fs::directory_iterator(), static_cast<bool(*)(const fs::path&)>(fs::is_regular_file)); std::priority_queue<data, std::vector<data>, compare> myheap; std::vector<std::pair<std::ifstream, std::string>> vifs(cnt); int index = 0; boost::smatch what; std::string file; (fs::directory_iterator d(path); d != fs::directory_iterator(); ++d) { if (fs::is_regular_file(d->path())) { file = d->path().filename().string(); if (boost::regex_search(file, what, namefileex)) { vifs[index] = std::make_pair(std::ifstream(d->path().string()), what[1]); initializeheap(vifs[index].first, myheap, index); ++index; } } } std::ofstream filename(path + "\\testlog.txt"); mergefiles(vifs, myheap, filename); filename.close(); clock_t end = clock(); double elapsed_secs = double(end - begin) / clocks_per_sec; std::cout << "elapsed time = " << elapsed_secs << "\n"; return 0; }
after work, realized yesterday run program in debug. launching both implementations in release, got following results:
- vector implementation: 25 seconds
- heap implementation: 27 seconds
thus, or implementation heap structure not optimized or 2 implementation equal in running time.
is there else can speed execution?
this can done faster , low memory. consider first:
- read single line each file (so
n
lines in memory @ 1 time). - find smallest of
n
lines, output it. - in memory, replace value output next line file current output came (take care of eof case).
if m
length of output file (ie length of logs combined), trivial implementation o(n * m)
.
however, above can improved using heap, reduces time o(m log n)
. is, put n
in-memory elements on heap. pop off top 1 output smallest element. then, when read new line, throw line on heap.
Comments
Post a Comment