rgrep

Cargo.toml

Cargo.toml

[package]
name = "rgrep"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
anyhow = "1"
clap = { version = "3", features = ["derive"] }
colored = "2"
glob = "0.3"
itertools = "0.10"
rayon = "1"
regex = "1"
thiserror = "1"

src/error.rs: thiserror会自动转换

glob 模式解析错误(glob::PatternError)、正则表达式错误(regex::Error)和 I/O 错误(std::io::Error)都需要转换成统一的 GrepError。thiserror 能够通过宏(#[from] 属性)帮我们完成错误类型的转换。

use thiserror::Error;

/// Errors that can occur while running a grep search.
///
/// Every variant wraps an underlying error type. The `#[from]` attribute makes
/// `thiserror` generate the matching `From` impl, so the `?` operator converts
/// the source error into a `GrepError` automatically.
#[derive(Error, Debug)]
pub enum GrepError {
    /// The file glob pattern could not be parsed.
    #[error("Glob pattern error")]
    GlobPatternError(#[from] glob::PatternError),
    /// The search regular expression failed to compile.
    #[error("Regex pattern error")]
    RegexPatternError(#[from] regex::Error),
    /// An underlying I/O operation failed.
    #[error("I/O error")]
    IoError(#[from] std::io::Error),
}

src/lib.rs:定义结构体+实现方法+单元测试

mod引入与使用

mod error;
pub use error::GrepError;

定义结构体: 专门简化复杂类型

StrategyFn 其实就是一个具有固定签名的函数指针类型,调用方可以传入任何符合该签名的策略函数。

/// Signature shared by every search strategy; the alias keeps this long
/// function-pointer type out of method signatures.
///
/// A strategy receives the path of the file being searched, a buffered reader
/// over its contents, the compiled regex, and a writer for the results.
pub type StrategyFn = fn(&Path, &mut dyn BufRead, &Regex, &mut dyn Write) -> Result<(), GrepError>;

grep 的配置结构体:借助 clap 从命令行解析出正则表达式和文件通配符

// Command-line configuration for the simplified grep: a regular expression
// plus a file glob.
//
// NOTE: with `#[derive(Parser)]`, clap turns the `///` doc comments below into
// the program's help text, so they are user-facing strings and are
// deliberately left untouched; the explanatory notes use `//` instead.
/// 简化版本的 grep,支持正则表达式和文件通配符
#[derive(Parser, Debug)]
#[clap(version = "1.0", author = "Tyr Chen <tyr@chen.com>")]
pub struct GrepConfig {
    // Regular expression to search for (first positional argument).
    /// 用于查找的正则表达式
    pattern: String,
    // Glob pattern selecting the files to search (second positional argument).
    /// 文件通配符
    glob: String,
}

lib.rs: 给结构体实现方法

impl GrepConfig {
    /// 使用缺省策略来查找匹配
    pub fn match_with_default_strategy(&self) -> Result<(), GrepError> {
        self.match_with(default_strategy)
    }

    /// 使用某个策略函数来查找匹配
    pub fn match_with(&self, strategy: StrategyFn) -> Result<(), GrepError> {
        let regex = Regex::new(&self.pattern)?;
        // 生成所有符合通配符的文件列表
        let files: Vec<_> = glob::glob(&self.glob)?.collect();
        // 并行处理所有文件
        files.into_par_iter().for_each(|v| {
            if let Ok(filename) = v {
                if let Ok(file) = File::open(&filename) {
                    let mut reader = BufReader::new(file);
                    let mut stdout = io::stdout();

                    if let Err(e) = strategy(filename.as_path(), &mut reader, &regex, &mut stdout) {
                        println!("Internal error: {:?}", e);
                    }
                }
            }
        });
        Ok(())
    }
}

主要实现两种解析策略:

  1. 默认策略:match_with_default_strategy, 使用default_strategy
  2. 指定策略:match_with, 使用传入的strategy: StrategyFn

默认策略: default_strategy

/// 缺省策略,从头到尾串行查找,最后输出到 writer
pub fn default_strategy(
    path: &Path,
    reader: &mut dyn BufRead,
    pattern: &Regex,
    writer: &mut dyn Write,
) -> Result<(), GrepError> {
    let matches: String = reader
        .lines()
        .enumerate()
        .map(|(lineno, line)| {
            line.ok()
                .map(|line| {
                    pattern
                        .find(&line)
                        .map(|m| format_line(&line, lineno + 1, m.range()))
                })
                .flatten()
        })
        .filter_map(|v| v.ok_or(()).ok())
        .join("\n");

    if !matches.is_empty() {
        writer.write_all(path.display().to_string().green().as_bytes())?;
        writer.write_all(b"\n")?;
        writer.write_all(matches.as_bytes())?;
        writer.write_all(b"\n")?;
    }

    Ok(())
}

格式化输出

/// 格式化输出匹配的行,包含行号,列号和带有高亮的第一个匹配项
pub fn format_line(line: &str, lineno: usize, range: Range<usize>) -> String {
    let Range { start, end } = range;
    let prefix = &line[..start];
    format!(
        "{0: >6}:{1: <3} {2}{3}{4}",
        lineno.to_string().blue(),
        // 找到匹配项的起始位置,注意对汉字等非 ascii 字符,我们不能使用 prefix.len()
        // 这是一个 O(n) 的操作,会拖累效率,这里只是为了演示的效果
        (prefix.chars().count() + 1).to_string().cyan(),
        prefix,
        &line[start..end].red(),
        &line[end..]
    )
}

单元测试

#[cfg(test)]
mod tests {
    use super::*;

    /// `format_line` must emit the line number, the 1-based column and the
    /// highlighted match surrounded by the untouched rest of the line.
    #[test]
    fn format_line_should_work() {
        let expected = format!(
            "{0: >6}:{1: <3} Hello, {2}~",
            "1000".blue(),
            "8".cyan(),
            "Tyr".red()
        );
        let result = format_line("Hello, Tyr~", 1000, 7..10);

        assert_eq!(result, expected);
    }

    /// `default_strategy` must print the path followed by one formatted line
    /// per match, each terminated by a newline.
    #[test]
    fn default_strategy_should_work() {
        let pattern = Regex::new(r"he\w+").unwrap();
        let mut reader = BufReader::new(&b"hello world!\nhey Tyr!"[..]);
        let mut output = Vec::new();

        default_strategy(Path::new("src/main.rs"), &mut reader, &pattern, &mut output).unwrap();

        let result = String::from_utf8(output).unwrap();
        let expected = format!(
            "src/main.rs\n{}\n{}\n",
            format_line("hello world!", 1, 0..5),
            format_line("hey Tyr!", 2, 0..3),
        );
        assert_eq!(result, expected);
    }
}

src/main.rs

src/lib.rs 顶部引入的第三方库(以下 use 声明属于 lib.rs,而非 main.rs)

use itertools::Itertools;

src/lib.rs 顶部的其余 use 声明(main() 函数的代码未在本文中给出)

use regex::Regex;
use std::{
    fs::File,
    io::{self, BufRead, BufReader, Write},
    ops::Range,
    path::Path,
};

使用

示例:cargo run --quiet -- "正则表达式" "src/*.rs"

cargo run --quiet -- "Re[^\\s]+" "src/*.rs"
src/main.rs
     1:13  use anyhow::Result;
     5:14  fn main() -> Result<()> {
src/error.rs
     7:14      #[error("Regex pattern error")]
     8:5       RegexPatternError(#[from] regex::Error),
src/lib.rs
     5:12  use regex::Regex;
     8:19      io::{self, BufRead, BufReader, Write},
    17:45  pub type StrategyFn = fn(&Path, &mut dyn BufRead, &Regex, &mut dyn Write) -> Result<(), GrepError>;
    31:50      pub fn match_with_default_strategy(&self) -> Result<(), GrepError> {
    36:55      pub fn match_with(&self, strategy: StrategyFn) -> Result<(), GrepError> {
    37:21          let regex = Regex::new(&self.pattern)?;
    44:41                      let mut reader = BufReader::new(file);
    60:25      reader: &mut dyn BufRead,
    61:15      pattern: &Regex,
    63:6   ) -> Result<(), GrepError> {
   126:29          let mut reader = BufReader::new(&input[..]);
   127:23          let pattern = Regex::new(r"he\w+").unwrap();