Skip to content

Training Data Cleaner

Calcitem edited this page Nov 28, 2022 · 1 revision
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

using namespace std;

/// Removes duplicate lines from the merged training-data file.
///
/// Reads the input file line by line and keeps the first occurrence of every
/// non-empty line, in original order. Empty lines are always kept and are
/// never treated as duplicates. The surviving lines are then written to the
/// output file, one per line.
///
/// Progress is echoed to stdout: '*' for each new non-empty line kept,
/// '#' for each duplicate dropped, and '>' for each line written.
///
/// @return 0 on success; -1 if the input or output file cannot be opened,
///         or if writing the output fails.
int main()
{
    const std::string inputFile = "D:\\repos\\merger\\all.txt";
    const std::string outputFile = "D:\\repos\\merger\\clean.txt";

    std::ifstream file(inputFile);
    if (!file.is_open()) {
        std::cerr << "Cannot open input file: " << inputFile << std::endl;
        return -1;
    }

    // `kept` preserves first-seen order for the output; `seen` gives O(1)
    // average duplicate checks instead of rescanning `kept` for every line.
    std::vector<std::string> kept;
    std::unordered_set<std::string> seen;

    std::string line;
    while (std::getline(file, line)) {
        if (line.empty()) {
            // Empty lines are passed through unconditionally, with no marker.
            kept.push_back(line);
            continue;
        }

        // insert() reports whether the line was new via `.second`.
        if (seen.insert(line).second) {
            kept.push_back(line);
            std::cout << "*";
        } else {
            std::cout << "#";
        }
    }

    file.close();

    std::cout << std::endl;

    std::ofstream ofile(outputFile);
    if (!ofile.is_open()) {
        std::cerr << "Cannot open output file: " << outputFile << std::endl;
        return -1;
    }

    for (const std::string& keptLine : kept) {
        ofile << keptLine << "\n";
        std::cout << ">";
    }

    // close() flushes the buffer; a failure here means the output is incomplete.
    ofile.close();

    std::cout << std::endl;

    if (ofile.fail()) {
        std::cerr << "Failed to write output file: " << outputFile << std::endl;
        return -1;
    }

    std::cout << "Done." << std::endl;
    std::system("pause");
    return 0;
}
Clone this wiki locally