c++ 图论学习3

使用 C++ 来实现一个针对企业内部文件的网络爬虫,尽可能利用 C++的新特性和图的遍历算法


#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
 
namespace fs = std::filesystem;
 
class InternalCrawler {
private:
    std::unordered_set visited;
    std::queue to_visit;
    std::mutex mtx;
    std::atomic<bool> running{true};
    std::vector threads;
    std::regex file_pattern;
    fs::path root_dir;
 
    void crawl_file(const fs::path& file_path) {
        std::ifstream file(file_path);
        std::string line;
        while (std::getline(file, line)) {
            std::smatch match;
            if (std::regex_search(line, match, file_pattern)) {
                std::string new_file = match.str();
                fs::path new_path = root_dir / new_file;
                if (fs::exists(new_path)) {
                    std::lock_guard lock(mtx);
                    if (visited.find(new_file) == visited.end()) {
                        to_visit.push(new_file);
                        visited.insert(new_file);
                    }
                }
            }
        }
    }
 
    void worker() {
        while (running) {
            std::string current_file;
            {
                std::lock_guard lock(mtx);
                if (to_visit.empty()) {
                    std::this_thread::sleep_for(std::chrono::milliseconds(100));
                    continue;
                }
                current_file = to_visit.front();
                to_visit.pop();
            }
            crawl_file(root_dir / current_file);
        }
    }
 
public:
    InternalCrawler(const fs::path& root, const std::string& pattern, int num_threads = 4)
        : root_dir(root), file_pattern(pattern) {
        for (int i = 0; i < num_threads; ++i) {
            threads.emplace_back(&InternalCrawler::worker, this);
        }
    }
 
    void start(const std::string& start_file) {
        to_visit.push(start_file);
        visited.insert(start_file);
    }
 
    void stop() {
        running = false;
        for (auto& thread : threads) {
            thread.join();
        }
    }
 
    void print_visited() {
        for (const auto& file : visited) {
            std::cout << file << std::endl;
        }
    }
};
 
int main() {
    fs::path root_dir = "/path/to/your/document/root";
    std::string file_pattern = R"(\b[a-zA-Z0-9_-]+\.(txt|doc|pdf)\b)";
    InternalCrawler crawler(root_dir, file_pattern);
 
    crawler.start("start_document.txt");
    
    // 让爬虫运行一段时间
    std::this_thread::sleep_for(std::chrono::seconds(30));
    
    crawler.stop();
    crawler.print_visited();
 
    return 0;
}

这个实现使用了以下 C++新特性和图的遍历算法:

使用 std::filesystem (C++17) 进行文件系统操作。
使用 std::thread 和 std::mutex (C++11) 实现多线程爬虫。
使用 std::atomic (C++11) 实现线程安全的控制标志。
使用 std::regex (C++11) 进行文件名匹配。
使用 std::unordered_set 存储已访问的文件,实现 O(1) 的查找复杂度。
使用广度优先搜索 (BFS) 算法遍历文件网络图。

这个爬虫的工作原理如下:

从指定的起始文件开始。
使用多个线程并行处理文件。
每个线程从队列中取出一个文件,读取其内容,并使用正则表达式查找其他文件的引用。
如果找到新的文件引用,检查该文件是否存在,如果存在且未被访问过,则加入队列。
使用互斥锁保护共享数据结构(visited集合和to_visit队列)。
继续这个过程,直到所有文件都被处理或手动停止爬虫。

为了方便理解下面为每一行代码都配上了注释


#include       // 用于输入输出
#include         // 用于存储线程
#include          // 用于存储待访问的文件
#include  // 用于存储已访问的文件
#include     // 用于文件系统操作
#include        // 用于文件读取
#include         // 用于多线程支持
#include          // 用于线程同步
#include         // 用于原子操作
#include         // 用于时间相关操作
#include          // 用于正则表达式
 
namespace fs = std::filesystem; // 为std::filesystem创建别名
 
class InternalCrawler {
private:
    std::unordered_set visited; // 存储已访问的文件
    std::queue to_visit;        // 存储待访问的文件
    std::mutex mtx;                          // 用于保护共享数据的互斥锁
    std::atomic<bool> running{true};         // 控制爬虫运行状态的原子变量
    std::vector threads;        // 存储工作线程
    std::regex file_pattern;                 // 用于匹配文件名的正则表达式
    fs::path root_dir;                       // 文档根目录
 
    void crawl_file(const fs::path& file_path) {
        std::ifstream file(file_path);       // 打开文件
        std::string line;
        while (std::getline(file, line)) {   // 逐行读取文件内容
            std::smatch match;
            if (std::regex_search(line, match, file_pattern)) { // 在行中搜索匹配的文件名
                std::string new_file = match.str();
                fs::path new_path = root_dir / new_file; // 构造新文件的完整路径
                if (fs::exists(new_path)) {  // 检查文件是否存在
                    std::lock_guard lock(mtx); // 锁定互斥锁
                    if (visited.find(new_file) == visited.end()) { // 检查文件是否已被访问
                        to_visit.push(new_file);  // 将新文件加入待访问队列
                        visited.insert(new_file); // 将新文件标记为已访问
                    }
                }
            }
        }
    }
 
    void worker() {
        while (running) { // 当爬虫正在运行时
            std::string current_file;
            {
                std::lock_guard lock(mtx); // 锁定互斥锁
                if (to_visit.empty()) { // 如果没有待访问的文件
                    std::this_thread::sleep_for(std::chrono::milliseconds(100)); // 休眠一段时间
                    continue;
                }
                current_file = to_visit.front(); // 获取队列前端的文件
                to_visit.pop(); // 从队列中移除该文件
            }
            crawl_file(root_dir / current_file); // 处理当前文件
        }
    }
 
public:
    InternalCrawler(const fs::path& root, const std::string& pattern, int num_threads = 4)
        : root_dir(root), file_pattern(pattern) {
        for (int i = 0; i < num_threads; ++i) {
            threads.emplace_back(&InternalCrawler::worker, this); // 创建工作线程
        }
    }
 
    void start(const std::string& start_file) {
        to_visit.push(start_file);   // 将起始文件加入待访问队列
        visited.insert(start_file);  // 将起始文件标记为已访问
    }
 
    void stop() {
        running = false; // 停止爬虫运行
        for (auto& thread : threads) {
            thread.join(); // 等待所有工作线程结束
        }
    }
 
    void print_visited() {
        for (const auto& file : visited) {
            std::cout << file << std::endl; // 打印所有已访问的文件
        }
    }
};
 
int main() {
    fs::path root_dir = "/path/to/your/document/root"; // 设置文档根目录
    std::string file_pattern = R"(\b[a-zA-Z0-9_-]+\.(txt|doc|pdf)\b)"; // 设置文件名匹配模式
    InternalCrawler crawler(root_dir, file_pattern); // 创建爬虫实例
 
    crawler.start("start_document.txt"); // 开始爬取，从start_document.txt开始
    
    // 让爬虫运行30秒
    std::this_thread::sleep_for(std::chrono::seconds(30));
    
    crawler.stop(); // 停止爬虫
    crawler.print_visited(); // 打印所有已访问的文件
 
    return 0;
}

相关阅读:
Servlet快速入门
 import.meta.glob() 如何导入多个目录下的资源
 Java&数组
 Spring中什么样的Bean存在线程安全问题-有状态bean
最受欢迎的编程语言排行榜
 算法与数据结构【30天】集训营——线性表的定义及特点-顺序表的表示与实现及操作（03）
【NDVI：注意力机制：遥感图像】
关于api设计的一些思考-restful理想主义与商业需求的无解冲突
 UE里的Sync Groups
【生成对抗网络论文泛读】……StarganV1 & StarganV2……
原文地址：https://blog.csdn.net/m0_61862494/article/details/140046617