From 4b75defc8d9db21f08695f145de06baf035a49c2 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Mon, 3 Dec 2018 18:07:34 -0400 Subject: [PATCH 01/17] Replacing the boilerplate go code with a Rust Cargo project. Also implemented += on result as part of this change. --- .gitignore | 2 ++ Cargo.lock | 4 +++ Cargo.toml | 6 +++++ Gopkg.lock | 9 ------- Gopkg.toml | 22 --------------- main.go | 37 ------------------------- result.go | 50 ---------------------------------- src/main.rs | 45 +++++++++++++++++++++++++++++++ src/result.rs | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 132 insertions(+), 118 deletions(-) create mode 100644 .gitignore create mode 100644 Cargo.lock create mode 100644 Cargo.toml delete mode 100644 Gopkg.lock delete mode 100644 Gopkg.toml delete mode 100644 main.go delete mode 100644 result.go create mode 100644 src/main.rs create mode 100644 src/result.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..53eaa21 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +**/*.rs.bk diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..fbea8ec --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,4 @@ +[[package]] +name = "clever-challenge" +version = "0.1.0" + diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..e1c01f3 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,6 @@ +[package] +name = "clever-challenge" +version = "0.1.0" +authors = ["Tormyst "] + +[dependencies] diff --git a/Gopkg.lock b/Gopkg.lock deleted file mode 100644 index bef2d00..0000000 --- a/Gopkg.lock +++ /dev/null @@ -1,9 +0,0 @@ -# This file is autogenerated, do not edit; changes may be undone by the next 'dep ensure'. - - -[solve-meta] - analyzer-name = "dep" - analyzer-version = 1 - inputs-digest = "ab4fef131ee828e96ba67d31a7d690bd5f2f42040c6766b1b12fe856f87e0ff7" - solver-name = "gps-cdcl" - solver-version = 1 diff --git a/Gopkg.toml b/Gopkg.toml deleted file mode 100644 index 9425a54..0000000 --- a/Gopkg.toml +++ /dev/null @@ -1,22 +0,0 @@ - -# Gopkg.toml example -# -# Refer to https://github.com/golang/dep/blob/master/docs/Gopkg.toml.md -# for detailed Gopkg.toml documentation. -# -# required = ["github.com/user/thing/cmd/thing"] -# ignored = ["github.com/user/project/pkgX", "bitbucket.org/user/project/pkgA/pkgY"] -# -# [[constraint]] -# name = "github.com/user/project" -# version = "1.0.0" -# -# [[constraint]] -# name = "github.com/user/project2" -# branch = "dev" -# source = "github.com/myfork/project2" -# -# [[override]] -# name = "github.com/x/y" -# version = "2.4.0" - diff --git a/main.go b/main.go deleted file mode 100644 index f62aa00..0000000 --- a/main.go +++ /dev/null @@ -1,37 +0,0 @@ -package main - -import ( - "fmt" - "time" -) - -//timeTrack tracks the time it took to do things. -//It's a convenient method that you can use everywhere -//you feel like it -func timeTrack(start time.Time, name string) { - elapsed := time.Since(start) - fmt.Printf("%s took %s\n", name, elapsed) -} - -//main is the entry point of our go program. It defers -//the execution of timeTrack so we can know how long it -//took for the main to complete. -//It also calls the compute and output the returned struct -//to stdout. -func main() { - defer timeTrack(time.Now(), "compute diff") - fmt.Println(compute()) -} - -//compute parses the git diffs in ./diffs and returns -//a result struct that contains all the relevant informations -//about these diffs -// list of files in the diffs -// number of regions -// number of line added -// number of line deleted -// list of function calls seen in the diffs and their number of calls -func compute() *result { - - return nil -} diff --git a/result.go b/result.go deleted file mode 100644 index 7e78236..0000000 --- a/result.go +++ /dev/null @@ -1,50 +0,0 @@ -package main - -import ( - "bytes" - "strconv" -) - -//result contains an analysis of a set of commit -type result struct { - //The name of the files seen - files []string - //How many region we have (i.e. seperated by @@) - regions int - //How many line were added total - lineAdded int - //How many line were deleted totla - lineDeleted int - //How many times the function seen in the code are called. - functionCalls map[string]int -} - -//String returns the value of results as a formated string -func (r *result) String() string { - - var buffer bytes.Buffer - buffer.WriteString("Files: \n") - for _, file := range r.files { - buffer.WriteString(" -") - buffer.WriteString(file) - buffer.WriteString("\n") - } - r.appendIntValueToBuffer(r.regions, "Regions", &buffer) - r.appendIntValueToBuffer(r.lineAdded, "LA", &buffer) - r.appendIntValueToBuffer(r.lineDeleted, "LD", &buffer) - - buffer.WriteString("Functions calls: \n") - for key, value := range r.functionCalls { - r.appendIntValueToBuffer(value, key, &buffer) - } - - return buffer.String() -} - -//appendIntValueToBuffer appends int value to a bytes buffer -func (r result) appendIntValueToBuffer(value int, label string, buffer *bytes.Buffer) { - buffer.WriteString(label) - buffer.WriteString(" : ") - buffer.WriteString(strconv.Itoa(value)) - buffer.WriteString("\n") -} diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..a4c335e --- /dev/null +++ b/src/main.rs @@ -0,0 +1,45 @@ +#![allow(non_snake_case)] +use std::time::Instant; + +mod result; +use result::Result; + +// timeTrack has been modified in the conversion from go to Rust. +// Unfortunetly, the time library has many direct time functions +// still marked as nightly. +fn timeTrack(start: Instant, string: &'static str) { + let elapsed = start.elapsed(); + print!("{} took {} seconds and {} nanoseconds\n", + string, + elapsed.as_secs(), + elapsed.subsec_nanos()); +} + +// main is the entry point of our Rust program. +// +// The go line: defer timeTrack(time.Now(), "compute diff") +// translates as our first and last lines. Rust has no defer +// as that is memory that is held for the duration of the function +// secretly and against the visibility that rust strives for. +// +// It also calls the compute and Display method of the returned struct +// to stdout. +fn main() { + let now = Instant::now(); + + println!("{}", compute()); + + timeTrack(now, "compute diff"); +} + +// compute parses the git diffs in ./diffs and returns +// a result struct that contains all the relevant informations +// about these diffs +// list of files in the diffs +// number of regions +// number of line added +// number of line deleted +// list of function calls seen in the diffs and their number of calls +fn compute() -> Result { + Result::empty() +} diff --git a/src/result.rs b/src/result.rs new file mode 100644 index 0000000..51f0eec --- /dev/null +++ b/src/result.rs @@ -0,0 +1,75 @@ +use std::fmt; +use std::collections::{HashSet, HashMap}; +use std::ops::AddAssign; + +// result contains an analysis of a set of commits +pub struct Result { + // The name of the files seen + files: HashSet, + // How many regions we have (i.e. seperated by @@) + regions: usize, + // How many lines were added total + lineAdded: usize, + // How many lines were deleted total + lineDeleted: usize, + // How many times the function seen in the code are called + functionCalls: HashMap, +} + +impl Result { + // Creates an empty Result struct. + pub fn empty() -> Self { + Self::new(HashSet::new(), 0, 0, 0, HashMap::new()) + } + + // Constructor for a Result struct that expects everything to be handed to it. + pub fn new(files: HashSet, + regions: usize, + lineAdded: usize, + lineDeleted: usize, + functionCalls: HashMap) + -> Self { + Result { + files, + regions, + lineAdded, + lineDeleted, + functionCalls, + } + } +} + +// Implementation of the formating system in rust ensuring that the structure may be +// printed in a way close to that of the print statment provided in the go structure. +impl fmt::Display for Result { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Files: \n")?; + for file in &self.files { + write!(f, " -{}\n", file)?; + } + write!(f, "Regions: {}\n", self.regions)?; + write!(f, "LA: {}\n", self.lineAdded)?; + write!(f, "LD: {}\n", self.lineDeleted)?; + + write!(f, "Functions calls: \n")?; + for (key, value) in &self.functionCalls { + write!(f, " {}: {}\n", key, value)?; + } + Ok(()) + } +} + +// Iplementation of the += opperator which is useful syntacticaly to clearly demonstrait +// how results are combined. +impl AddAssign for Result { + fn add_assign(&mut self, mut other: Result) { + other.files.drain().for_each(|file| {self.files.insert(file);}); + self.regions += other.regions; + self.lineAdded += other.lineAdded; + self.lineDeleted += other.lineDeleted; + other.functionCalls.drain().for_each(|(key, value)| { + let to_add = *self.functionCalls.get(&key).unwrap_or(&0); + self.functionCalls.insert(key, value + to_add); + }); + } +} From 00718b5cd16a96f0425e13643bc6774398599f7d Mon Sep 17 00:00:00 2001 From: Tormyst Date: Mon, 3 Dec 2018 18:56:50 -0400 Subject: [PATCH 02/17] Updating readme --- README.md | 61 ++++++++++--------------------------------------------- 1 file changed, 11 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index b309fbc..72c91f4 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,6 @@ -# Clever Initiative Challenge +# Clever Initiative Challenge Implementation -The Clever-Initiative is a team of the Technology Group @ Ubisoft Montreal. Our goal is to discover, improve and promote state-of-the-art software engineering practices that aim to ease the production of quality. By augmenting the quality of our products, we mechanically improve the productivity of our teams as they can focus creating features rather than fixing bugs. - -The Clever-Initiative is the team behind the [Submit-Assitant](https://montreal.ubisoft.com/en/ubisoft-la-forge-presents-the-commit-assistant/) (a.k.a Clever-Commit) that received some press coverage recently: [Wired](http://www.wired.co.uk/article/ubisoft-commit-assist-ai), [MIT](https://www.technologyreview.com/the-download/610416/ai-can-help-spot-coding-mistakes-before-they-happen/), and [more](https://www.google.ca/search?q=commit+assistant+ubisoft). The scientific foundation behind our work have been accepted for publication to [MSR'18](https://montreal.ubisoft.com/en/clever-combining-code-metrics-with-clone-detection-for-just-in-time-fault-prevention-and-resolution-in-large-industrial-projects-2/), [CPPCON'18](https://www.youtube.com/watch?v=QDvic0QNtOY). - -We are currently looking for trainees to join us (Winter'19). The length and start date of the internship will be discussed on a per applicant basis. - -## Trainees - -A trainee applicant must: - -- Be engaged in a computer science (or related) university program. -- Be able to work in Canada legally. -- Be willing to come to Montreal. -- Be able to read, understand and implement scientific papers. -- Know: - - versionning systems (git, perforce, ...) - - c/c++/csharp or java -- Know or be willing to learn: - - golang - - docker - - sql - - angular +Within this repo, you will find an implementation of the Clever Initiative Challenge written in rust. The focus will be to keep the goals for extendability, maintainability and efficiency during the process of solving the problem. ## The Challenge @@ -35,38 +14,20 @@ The challenge for trainee applicant consists in parsing a few diffs--in the most All these stats are to be computed globally (i.e. for all the diffs combined). -In the main.go file; you'll find the `compute` method that needs to be implemented. - -```golang -//compute parses the git diffs in ./diffs and returns -//a result struct that contains all the relevant information -//about these diffs -// list of files in the diffs -// number of regions -// number of line added -// number of line deleted -// list of function calls seen in the diffs and their number of calls -func compute() *result { +## Permanent Positions - return nil -} -``` +I am applying for a permanent positions. I understand that this is meant for less permanent positions. However, as I was directly sent here, I will be completing this challenge. -To enter the challenge: +## Why rust? -- Fork this repository -- Implement your solution -- Open a pull request with your solution. In the body of the pull request, you can explain the choices you made if necessary. +I wish I could do this in Go. I don't believe in presenting something in a programing language while I am learning it. As I would be new to Go, I would feel it is not my best work. -You can alter the data structure, add files, remove files ... you can even start from scratch in another language if you feel like it. -However, note that we do use golang internally. +While I am new to rust, my major Rust side project [FeGaBo](github.com/tormyst/FeGaBo) has given me enough experience with the language. -If you don't feel comfortable sharing your pull-request with the world (pull-request are public) you can invite me (@mathieunls for github, bitbucket and gitlab) and Florent Jousset (@heeko for github, bitbucket and gitlab) to a private repo of yours. Don't bother to send code by email, we won't read it. +Using rust has interesting benefits. While possible to make efficient solutions in other languages, the advantages that rust offers pushes a lot of work onto the compiler. Ensuring that a task can multithreaded is simple in rust as if the operation is not memory safe, Rust will not compile. Casts and mutability are brought forward ensuring only what needs to be done is. -## Permanent Positions +## Building this project -We are also looking for permanent members to join our team. If you are interested, mail our human resource [contact](mailto:alison.laplante-rayworth@ubisoft.com?subject=Clever-Initiative) with your resume. You can submit your pull request for the challenge. However, you'll be subjected to an in-depth (much harder) coding test. This one has been conceived for students only and it might not be worth your time to take it ;). +This is a `cargo` project running on nightly rust. -- [ ] Software Engineer / ML Dev (python, go, ml, sql, ...) -- [ ] Software Engineer / Backend Dev (go, c, cfg, ast, k8s, redis, ...) -- [ ] Software Engineer / Tool devs (csharp, python, cfg, ast, ...) +Once everything installed, you can use `cargo run` to run the solution From 697ca1c63215be07ac863ffd9e055fff685cd4e0 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Mon, 3 Dec 2018 19:42:03 -0400 Subject: [PATCH 03/17] Basic implementation of compute This method has none of the diff checking properties, it was just ensuring that we read the directory for all diffs that need to be checked. They are temporaraly stored in the set of found files. --- src/main.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index a4c335e..c21044d 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,5 +1,7 @@ #![allow(non_snake_case)] use std::time::Instant; +use std::path::Path; +use std::collections::{HashMap, HashSet}; mod result; use result::Result; @@ -27,7 +29,7 @@ fn timeTrack(start: Instant, string: &'static str) { fn main() { let now = Instant::now(); - println!("{}", compute()); + println!("{}", compute().unwrap()); timeTrack(now, "compute diff"); } @@ -40,6 +42,18 @@ fn main() { // number of line added // number of line deleted // list of function calls seen in the diffs and their number of calls -fn compute() -> Result { - Result::empty() +fn compute() -> std::result::Result { + let mut retVal = Result::empty(); + let data_folder = Path::new("./diffs"); + + for entry in data_folder.read_dir()? { + let entry = entry?; + if entry.path().is_file() { + let mut files = HashSet::new(); + files.insert(entry.file_name().into_string().unwrap()); + retVal += Result::new(files, 0, 0, 0, HashMap::new()); + } + } + + Ok(retVal) } From edee72695551b32f9118a56b37e2ec6c9d39c3d9 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Mon, 3 Dec 2018 19:43:14 -0400 Subject: [PATCH 04/17] Updating the formating of results.rs RustFmt got lazy and forgot these changes earlier. --- src/result.rs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/src/result.rs b/src/result.rs index 51f0eec..28a48bb 100644 --- a/src/result.rs +++ b/src/result.rs @@ -22,7 +22,7 @@ impl Result { Self::new(HashSet::new(), 0, 0, 0, HashMap::new()) } - // Constructor for a Result struct that expects everything to be handed to it. + // Constructor for a Result struct that expects everything to be handed to it. pub fn new(files: HashSet, regions: usize, lineAdded: usize, @@ -39,7 +39,7 @@ impl Result { } } -// Implementation of the formating system in rust ensuring that the structure may be +// Implementation of the formating system in rust ensuring that the structure may be // printed in a way close to that of the print statment provided in the go structure. impl fmt::Display for Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -63,13 +63,19 @@ impl fmt::Display for Result { // how results are combined. impl AddAssign for Result { fn add_assign(&mut self, mut other: Result) { - other.files.drain().for_each(|file| {self.files.insert(file);}); + other + .files + .drain() + .for_each(|file| { self.files.insert(file); }); self.regions += other.regions; self.lineAdded += other.lineAdded; self.lineDeleted += other.lineDeleted; - other.functionCalls.drain().for_each(|(key, value)| { - let to_add = *self.functionCalls.get(&key).unwrap_or(&0); - self.functionCalls.insert(key, value + to_add); - }); + other + .functionCalls + .drain() + .for_each(|(key, value)| { + let to_add = *self.functionCalls.get(&key).unwrap_or(&0); + self.functionCalls.insert(key, value + to_add); + }); } } From 253263445ca7824371c7a02bffc7f6173df88004 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Tue, 4 Dec 2018 16:17:11 -0400 Subject: [PATCH 05/17] Adding a basic structure for opening a file --- src/diff.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 10 +++++---- 2 files changed, 71 insertions(+), 4 deletions(-) create mode 100644 src/diff.rs diff --git a/src/diff.rs b/src/diff.rs new file mode 100644 index 0000000..d9a2ae9 --- /dev/null +++ b/src/diff.rs @@ -0,0 +1,65 @@ +use result::Result; // Result directly is our result. +use std::collections::{HashMap, HashSet}; +use std::fs::read; +use std::path::Path; + +/// Given a file, return statistical information about the file as if it were a diff file. +/// Returns Some Result if the file could be read and None if the reading failed. +/// +/// # Arguments +/// +/// `file` - The path to the file to be analized +/// +/// # Advice +/// +/// This will be the main function per file. +/// It is adviced to create a handler around this as this function is not taking exclusive +/// rights to the path. The path should be secured by the handler for the sake of returning +/// an error message if the path was invalid and no data could be read. +pub fn diffStats(file: &Path) -> Option { + if let Ok(_file_data) = read(file) { + let mut retVal = Result::new(HashSet::new(), 0, 0, 0, HashMap::new()); + + // TODO Process file + + Some(retVal) + } else { + None // During file error, we simply return nothing to indicate that the file has no contents instead of valid contents with nothing in it. + } +} + + +#[cfg(test)] +mod tests { + use std::path::Path; + + use result::Result; + use super::diffStats; + + #[test] + fn invaldFile() { + let path = Path::new("/INVALID"); + assert!(match diffStats(&path) { + None => true, + _ => false, + }); + } + + #[test] + fn emptyFile() { + use std::fs::{write, remove_file}; + let filename = "test.tmp"; // We create an empty file just to be sure it exists. + write(&filename, ""); + let path = Path::new(&filename); + assert!(match diffStats(&path) { + Some(_) => true, + _ => false, + }); + // TODO implementing PartialEq on result to improve this test to the next line. + // assert!(match diffStats(&path){ Some(data) => data == Result::empty(), _ => false}); + remove_file(&filename); // Cleanup that file so we don't keep creating different files. + + // TODO crate tempfile should be introduced in test setup to improve this test by + // removing the issue with creating and removing a file. + } +} diff --git a/src/main.rs b/src/main.rs index c21044d..7bf0632 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,11 +1,12 @@ #![allow(non_snake_case)] use std::time::Instant; use std::path::Path; -use std::collections::{HashMap, HashSet}; mod result; use result::Result; +mod diff; + // timeTrack has been modified in the conversion from go to Rust. // Unfortunetly, the time library has many direct time functions // still marked as nightly. @@ -49,9 +50,10 @@ fn compute() -> std::result::Result { for entry in data_folder.read_dir()? { let entry = entry?; if entry.path().is_file() { - let mut files = HashSet::new(); - files.insert(entry.file_name().into_string().unwrap()); - retVal += Result::new(files, 0, 0, 0, HashMap::new()); + match diff::diffStats(entry.path().as_path()) { + Some(result) => retVal += result, + None => eprintln!("Error reading stats for path {}. Are you sure it exists?", entry.path().display()) + } } } From ae15bae3070503e1f20181d713f0bc2989294d4b Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 13:31:09 -0400 Subject: [PATCH 06/17] Adding framework for diff processing. Implemented an increment for regions in Result. With that, region calculation is done --- src/diff.rs | 72 ++++++++++++++++++++++++++++++++++++++++++++++++--- src/result.rs | 4 +++ 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/src/diff.rs b/src/diff.rs index d9a2ae9..abcf209 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -17,10 +17,29 @@ use std::path::Path; /// rights to the path. The path should be secured by the handler for the sake of returning /// an error message if the path was invalid and no data could be read. pub fn diffStats(file: &Path) -> Option { - if let Ok(_file_data) = read(file) { + if let Ok(file_data) = read(file) { let mut retVal = Result::new(HashSet::new(), 0, 0, 0, HashMap::new()); - - // TODO Process file + let file_data = match String::from_utf8(file_data) { + Ok(data) => data, + Err(_) => { + eprintln!("Error: Could not encode file as utf8"); + return None; + } + }; + file_data + .lines() + .for_each(|line| { + match diff_type(line) { + DiffType::Header => {} + DiffType::Index => {} + DiffType::OriginalFile => {} + DiffType::NewFile => {} + DiffType::NewRegion => retVal.add_region(), + DiffType::FileLine => {} + DiffType::Addition => {} + DiffType::Subtraction => {} + }; + }); Some(retVal) } else { @@ -29,6 +48,53 @@ pub fn diffStats(file: &Path) -> Option { } +enum DiffType { + Header, + Index, + OriginalFile, + NewFile, + NewRegion, + FileLine, + Addition, + Subtraction, +} + +/// Returns the diff status of a line. There are a few states, including the different header +/// lines, normal file lines, additions, and subtractions. +/// +/// This will do the minimum work required to figure out what type a line is because it assumes the +/// string slice it is given starts at the begining of the line and that the file is a valid diff +/// file. Any unknown first letter of the line is treated as FileLine, as are empty lines. +/// +/// # Known Issues +/// +/// Currently, OriginalFile and NewFile are treeted as Addition and Subtraction lines. +/// This is an issue of only using the first letter. While it could be improved, there are edge +/// cases that cannot be resolved such as an additon line adding a line with 2 or more '+' +/// characters. To avoid this, this system uses a blind approch that is not attempting to solve +/// this issue. +/// +/// # Arguments +/// +/// `line` - A string slice of a line of a diff file. +/// +/// # Example +/// ``` +/// assert!(diff_type("diff --git ...") == DiffType::Header) +fn diff_type(line: &str) -> DiffType { + match line.chars().next().unwrap_or(' ') { + 'd' => DiffType::Header, + 'i' => DiffType::Index, + '@' => DiffType::NewRegion, + '+' => DiffType::Addition, + '-' => DiffType::Subtraction, + ' ' => DiffType::FileLine, + _ => DiffType::FileLine, + } +} + + + #[cfg(test)] mod tests { use std::path::Path; diff --git a/src/result.rs b/src/result.rs index 28a48bb..7db70b0 100644 --- a/src/result.rs +++ b/src/result.rs @@ -37,6 +37,10 @@ impl Result { functionCalls, } } + + pub fn add_region(&mut self) { + self.regions += 1 + } } // Implementation of the formating system in rust ensuring that the structure may be From 984cae6bb9618791613e1f4c3a5eaedb9427d0bf Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 15:07:38 -0400 Subject: [PATCH 07/17] Smarter handler for diff reader has been added This checks what line came before the decide what to do next. Work in progress as there are states that are as of yet unfiled. Work in progress as documentation is not added yet to the new struct --- src/diff.rs | 109 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 95 insertions(+), 14 deletions(-) diff --git a/src/diff.rs b/src/diff.rs index abcf209..3d321c2 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -26,20 +26,21 @@ pub fn diffStats(file: &Path) -> Option { return None; } }; - file_data - .lines() - .for_each(|line| { - match diff_type(line) { - DiffType::Header => {} - DiffType::Index => {} - DiffType::OriginalFile => {} - DiffType::NewFile => {} - DiffType::NewRegion => retVal.add_region(), - DiffType::FileLine => {} - DiffType::Addition => {} - DiffType::Subtraction => {} - }; - }); + let mut diff_format_typer = DiffFormatTyper::new(); + let lines = file_data.lines(); + for line in lines { + match diff_format_typer.type_line(line) { + None => break, + Some(DiffType::Header) => {} + Some(DiffType::Index) => {} + Some(DiffType::OriginalFile) => {} + Some(DiffType::NewFile) => {} + Some(DiffType::NewRegion) => retVal.add_region(), + Some(DiffType::FileLine) => {} + Some(DiffType::Addition) => {} + Some(DiffType::Subtraction) => {} + }; + } Some(retVal) } else { @@ -48,6 +49,7 @@ pub fn diffStats(file: &Path) -> Option { } +#[derive(Debug, Clone)] enum DiffType { Header, Index, @@ -93,6 +95,85 @@ fn diff_type(line: &str) -> DiffType { } } +struct DiffFormatTyper { + last: Option, +} + +impl DiffFormatTyper { + fn new() -> Self { + DiffFormatTyper { last: None } + } + + fn type_line(&mut self, line: &str) -> Option { + let diff_type = diff_type(line); + let diff_type_fixed = match self.last.clone() { + None => { + match diff_type { + DiffType::Header => Some(DiffType::Header), + _ => None, + } + } + Some(last_type) => { + match last_type { + DiffType::Header => { + match diff_type { + this @ DiffType::Index => Some(this), + _ => None, + } + } + DiffType::Index => { + match diff_type { + DiffType::Subtraction => { + if line.starts_with("---") { + Some(DiffType::OriginalFile) + } else { + None + } + } + _ => None, + } + } + DiffType::OriginalFile => { + match diff_type { + DiffType::Addition => { + if line.starts_with("+++") { + Some(DiffType::NewFile) + } else { + None + } + } + _ => None, + } + } + DiffType::NewFile => { + match diff_type { + DiffType::NewRegion => Some(DiffType::NewRegion), + _ => None, + } + } + DiffType::NewRegion | DiffType::FileLine | DiffType::Addition | + DiffType::Subtraction => { + match diff_type { + this @ DiffType::FileLine | + this @ DiffType::Addition | + this @ DiffType::Subtraction | + this @ DiffType::NewRegion | + this @ DiffType::Header => Some(this), + _ => None, + } + } + } + } + }; + if let None = diff_type_fixed { + eprintln!("Error: Diff file is not formated correctly. Line {} after {:?} was unexpected.", + line, + self.last); + } + self.last = diff_type_fixed; + self.last.clone() + } +} #[cfg(test)] From 820d563f5ab4990664070c4fb533d8fd3c7f3490 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 15:12:46 -0400 Subject: [PATCH 08/17] Implemented line added and removed count --- src/diff.rs | 4 ++-- src/result.rs | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/diff.rs b/src/diff.rs index 3d321c2..13342d2 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -37,8 +37,8 @@ pub fn diffStats(file: &Path) -> Option { Some(DiffType::NewFile) => {} Some(DiffType::NewRegion) => retVal.add_region(), Some(DiffType::FileLine) => {} - Some(DiffType::Addition) => {} - Some(DiffType::Subtraction) => {} + Some(DiffType::Addition) => retVal.count_added_line(), + Some(DiffType::Subtraction) => retVal.count_removed_line(), }; } diff --git a/src/result.rs b/src/result.rs index 7db70b0..6d647e3 100644 --- a/src/result.rs +++ b/src/result.rs @@ -41,6 +41,14 @@ impl Result { pub fn add_region(&mut self) { self.regions += 1 } + + pub fn count_added_line(&mut self) { + self.lineAdded += 1 + } + + pub fn count_removed_line(&mut self) { + self.lineDeleted += 1 + } } // Implementation of the formating system in rust ensuring that the structure may be From 6b9e43e84ae7c79b02a6b259eb3bcb65b9f37e29 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 15:19:06 -0400 Subject: [PATCH 09/17] Adding NewMode header line --- src/diff.rs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/diff.rs b/src/diff.rs index 13342d2..e8792a9 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -35,6 +35,7 @@ pub fn diffStats(file: &Path) -> Option { Some(DiffType::Index) => {} Some(DiffType::OriginalFile) => {} Some(DiffType::NewFile) => {} + Some(DiffType::NewMode) => {} Some(DiffType::NewRegion) => retVal.add_region(), Some(DiffType::FileLine) => {} Some(DiffType::Addition) => retVal.count_added_line(), @@ -55,6 +56,7 @@ enum DiffType { Index, OriginalFile, NewFile, + NewMode, NewRegion, FileLine, Addition, @@ -87,6 +89,7 @@ fn diff_type(line: &str) -> DiffType { match line.chars().next().unwrap_or(' ') { 'd' => DiffType::Header, 'i' => DiffType::Index, + 'n' => DiffType::NewMode, '@' => DiffType::NewRegion, '+' => DiffType::Addition, '-' => DiffType::Subtraction, @@ -117,7 +120,8 @@ impl DiffFormatTyper { match last_type { DiffType::Header => { match diff_type { - this @ DiffType::Index => Some(this), + this @ DiffType::Index | + this @ DiffType::NewMode => Some(this), _ => None, } } @@ -151,6 +155,12 @@ impl DiffFormatTyper { _ => None, } } + DiffType::NewMode => { + match diff_type { + this @ DiffType::Index => Some(this), + _ => None, + } + } DiffType::NewRegion | DiffType::FileLine | DiffType::Addition | DiffType::Subtraction => { match diff_type { From 6779e0ae643bdca91dd4ea66cbd77ccc06d91378 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 15:28:46 -0400 Subject: [PATCH 10/17] Improving the state machine having to understand the diffs. Deleted files and empty file were supported here. --- src/diff.rs | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/diff.rs b/src/diff.rs index e8792a9..9186f24 100644 --- a/src/diff.rs +++ b/src/diff.rs @@ -31,15 +31,10 @@ pub fn diffStats(file: &Path) -> Option { for line in lines { match diff_format_typer.type_line(line) { None => break, - Some(DiffType::Header) => {} - Some(DiffType::Index) => {} - Some(DiffType::OriginalFile) => {} - Some(DiffType::NewFile) => {} - Some(DiffType::NewMode) => {} Some(DiffType::NewRegion) => retVal.add_region(), - Some(DiffType::FileLine) => {} Some(DiffType::Addition) => retVal.count_added_line(), Some(DiffType::Subtraction) => retVal.count_removed_line(), + _ => {} }; } @@ -58,6 +53,7 @@ enum DiffType { NewFile, NewMode, NewRegion, + FileDeleted, FileLine, Addition, Subtraction, @@ -111,7 +107,7 @@ impl DiffFormatTyper { let diff_type = diff_type(line); let diff_type_fixed = match self.last.clone() { None => { - match diff_type { + match diff_type { DiffType::Header => Some(DiffType::Header), _ => None, } @@ -119,14 +115,22 @@ impl DiffFormatTyper { Some(last_type) => { match last_type { DiffType::Header => { - match diff_type { + match diff_type { this @ DiffType::Index | this @ DiffType::NewMode => Some(this), + DiffType::Header => { + if line.starts_with("deleted") { + Some(DiffType::FileDeleted) + } else { + None + } + } _ => None, } } DiffType::Index => { - match diff_type { + match diff_type { + this @ DiffType::Header => Some(this), DiffType::Subtraction => { if line.starts_with("---") { Some(DiffType::OriginalFile) @@ -138,7 +142,7 @@ impl DiffFormatTyper { } } DiffType::OriginalFile => { - match diff_type { + match diff_type { DiffType::Addition => { if line.starts_with("+++") { Some(DiffType::NewFile) @@ -150,13 +154,19 @@ impl DiffFormatTyper { } } DiffType::NewFile => { - match diff_type { - DiffType::NewRegion => Some(DiffType::NewRegion), + match diff_type { + this @ DiffType::NewRegion => Some(this), _ => None, } } DiffType::NewMode => { - match diff_type { + match diff_type { + this @ DiffType::Index => Some(this), + _ => None, + } + } + DiffType::FileDeleted => { + match diff_type { this @ DiffType::Index => Some(this), _ => None, } From c499780262902acea5c2bf623f6d54b24eec10af Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 16:19:51 -0400 Subject: [PATCH 11/17] Refactor of src/diff.rs src/diff.rs can now be found in two parts: src/diff/mod.rs holds the front facing part of the module. src/diff/diff_type.rs holds the type parcing and state machine logic --- src/{diff.rs => diff/diff_type.rs} | 91 ++---------------------------- src/diff/mod.rs | 83 +++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 87 deletions(-) rename src/{diff.rs => diff/diff_type.rs} (63%) create mode 100644 src/diff/mod.rs diff --git a/src/diff.rs b/src/diff/diff_type.rs similarity index 63% rename from src/diff.rs rename to src/diff/diff_type.rs index 9186f24..d444579 100644 --- a/src/diff.rs +++ b/src/diff/diff_type.rs @@ -1,52 +1,5 @@ -use result::Result; // Result directly is our result. -use std::collections::{HashMap, HashSet}; -use std::fs::read; -use std::path::Path; - -/// Given a file, return statistical information about the file as if it were a diff file. -/// Returns Some Result if the file could be read and None if the reading failed. -/// -/// # Arguments -/// -/// `file` - The path to the file to be analized -/// -/// # Advice -/// -/// This will be the main function per file. -/// It is adviced to create a handler around this as this function is not taking exclusive -/// rights to the path. The path should be secured by the handler for the sake of returning -/// an error message if the path was invalid and no data could be read. -pub fn diffStats(file: &Path) -> Option { - if let Ok(file_data) = read(file) { - let mut retVal = Result::new(HashSet::new(), 0, 0, 0, HashMap::new()); - let file_data = match String::from_utf8(file_data) { - Ok(data) => data, - Err(_) => { - eprintln!("Error: Could not encode file as utf8"); - return None; - } - }; - let mut diff_format_typer = DiffFormatTyper::new(); - let lines = file_data.lines(); - for line in lines { - match diff_format_typer.type_line(line) { - None => break, - Some(DiffType::NewRegion) => retVal.add_region(), - Some(DiffType::Addition) => retVal.count_added_line(), - Some(DiffType::Subtraction) => retVal.count_removed_line(), - _ => {} - }; - } - - Some(retVal) - } else { - None // During file error, we simply return nothing to indicate that the file has no contents instead of valid contents with nothing in it. - } -} - - #[derive(Debug, Clone)] -enum DiffType { +pub enum DiffType { Header, Index, OriginalFile, @@ -94,16 +47,16 @@ fn diff_type(line: &str) -> DiffType { } } -struct DiffFormatTyper { +pub struct DiffFormatTyper { last: Option, } impl DiffFormatTyper { - fn new() -> Self { + pub fn new() -> Self { DiffFormatTyper { last: None } } - fn type_line(&mut self, line: &str) -> Option { + pub fn type_line(&mut self, line: &str) -> Option { let diff_type = diff_type(line); let diff_type_fixed = match self.last.clone() { None => { @@ -194,39 +147,3 @@ impl DiffFormatTyper { self.last.clone() } } - - -#[cfg(test)] -mod tests { - use std::path::Path; - - use result::Result; - use super::diffStats; - - #[test] - fn invaldFile() { - let path = Path::new("/INVALID"); - assert!(match diffStats(&path) { - None => true, - _ => false, - }); - } - - #[test] - fn emptyFile() { - use std::fs::{write, remove_file}; - let filename = "test.tmp"; // We create an empty file just to be sure it exists. - write(&filename, ""); - let path = Path::new(&filename); - assert!(match diffStats(&path) { - Some(_) => true, - _ => false, - }); - // TODO implementing PartialEq on result to improve this test to the next line. - // assert!(match diffStats(&path){ Some(data) => data == Result::empty(), _ => false}); - remove_file(&filename); // Cleanup that file so we don't keep creating different files. - - // TODO crate tempfile should be introduced in test setup to improve this test by - // removing the issue with creating and removing a file. - } -} diff --git a/src/diff/mod.rs b/src/diff/mod.rs new file mode 100644 index 0000000..0e51f5f --- /dev/null +++ b/src/diff/mod.rs @@ -0,0 +1,83 @@ +use result::Result; // Result directly is our result. +use std::collections::{HashMap, HashSet}; +use std::fs::read; +use std::path::Path; + +mod diff_type; +use self::diff_type::{DiffType, DiffFormatTyper}; + +/// Given a file, return statistical information about the file as if it were a diff file. +/// Returns Some Result if the file could be read and None if the reading failed. +/// +/// # Arguments +/// +/// `file` - The path to the file to be analized +/// +/// # Advice +/// +/// This will be the main function per file. +/// It is adviced to create a handler around this as this function is not taking exclusive +/// rights to the path. The path should be secured by the handler for the sake of returning +/// an error message if the path was invalid and no data could be read. +pub fn diffStats(file: &Path) -> Option { + if let Ok(file_data) = read(file) { + let mut retVal = Result::new(HashSet::new(), 0, 0, 0, HashMap::new()); + let file_data = match String::from_utf8(file_data) { + Ok(data) => data, + Err(_) => { + eprintln!("Error: Could not encode file as utf8"); + return None; + } + }; + let mut diff_format_typer = DiffFormatTyper::new(); + let lines = file_data.lines(); + for line in lines { + match diff_format_typer.type_line(line) { + None => break, + Some(DiffType::NewRegion) => retVal.add_region(), + Some(DiffType::Addition) => retVal.count_added_line(), + Some(DiffType::Subtraction) => retVal.count_removed_line(), + _ => {} + }; + } + + Some(retVal) + } else { + None // During file error, we simply return nothing to indicate that the file has no contents instead of valid contents with nothing in it. + } +} + +#[cfg(test)] +mod tests { + use std::path::Path; + + use result::Result; + use super::diffStats; + + #[test] + fn invaldFile() { + let path = Path::new("/INVALID"); + assert!(match diffStats(&path) { + None => true, + _ => false, + }); + } + + #[test] + fn emptyFile() { + use std::fs::{write, remove_file}; + let filename = "test.tmp"; // We create an empty file just to be sure it exists. + write(&filename, ""); + let path = Path::new(&filename); + assert!(match diffStats(&path) { + Some(_) => true, + _ => false, + }); + // TODO implementing PartialEq on result to improve this test to the next line. + // assert!(match diffStats(&path){ Some(data) => data == Result::empty(), _ => false}); + remove_file(&filename); // Cleanup that file so we don't keep creating different files. + + // TODO crate tempfile should be introduced in test setup to improve this test by + // removing the issue with creating and removing a file. + } +} From 235b4e88800a794a3b9d99e5da53c77f865e74ce Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 17:13:27 -0400 Subject: [PATCH 12/17] Adding filename parcing to diff header. Adding regex and lazy static to facilitate easy use of the parcing system for each line. The regex used could be replaced without effecting much of the outer system. Seeing as anywhere else, filenames are repeated, using the header seems like a good enough solution for the task that can be quickly exteneded. --- Cargo.lock | 92 ++++++++++++++++++++++++++++++++++++++++++ Cargo.toml | 2 + src/diff/diff_parse.rs | 9 +++++ src/diff/mod.rs | 8 ++++ src/main.rs | 3 ++ src/result.rs | 4 ++ 6 files changed, 118 insertions(+) create mode 100644 src/diff/diff_parse.rs diff --git a/Cargo.lock b/Cargo.lock index fbea8ec..fa15ea7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,4 +1,96 @@ +[[package]] +name = "aho-corasick" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "cfg-if" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "clever-challenge" version = "0.1.0" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "lazy_static" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "libc" +version = "0.2.44" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "memchr" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)", + "version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "regex-syntax" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "thread_local" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "ucd-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "utf8-ranges" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "version_check" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +[metadata] +"checksum aho-corasick 0.6.9 (registry+https://github.com/rust-lang/crates.io-index)" = "1e9a933f4e58658d7b12defcf96dc5c720f20832deebe3e0a19efd3b6aaeeb9e" +"checksum cfg-if 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "082bb9b28e00d3c9d39cc03e64ce4cea0f1bb9b3fde493f0cbc008472d22bdf4" +"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1" +"checksum libc 0.2.44 (registry+https://github.com/rust-lang/crates.io-index)" = "10923947f84a519a45c8fefb7dd1b3e8c08747993381adee176d7a82b4195311" +"checksum memchr 2.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0a3eb002f0535929f1199681417029ebea04aadc0c7a4224b46be99c7f5d6a16" +"checksum regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "37e7cbbd370869ce2e8dff25c7018702d10b21a20ef7135316f8daecd6c25b7f" +"checksum regex-syntax 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "4e47a2ed29da7a9e1960e1639e7a982e6edc6d49be308a3b02daf511504a16d1" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" +"checksum ucd-util 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "535c204ee4d8434478593480b8f86ab45ec9aae0e83c568ca81abf0fd0e88f86" +"checksum utf8-ranges 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "796f7e48bef87609f7ade7e06495a87d5cd06c7866e6a5cbfceffc558a243737" +"checksum version_check 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "914b1a6776c4c929a602fafd8bc742e06365d4bcbe48c30f9cca5824f70dc9dd" diff --git a/Cargo.toml b/Cargo.toml index e1c01f3..5835908 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,3 +4,5 @@ version = "0.1.0" authors = ["Tormyst "] [dependencies] +regex = "1.1.0" +lazy_static = "1.2.0" diff --git a/src/diff/diff_parse.rs b/src/diff/diff_parse.rs new file mode 100644 index 0000000..92cac05 --- /dev/null +++ b/src/diff/diff_parse.rs @@ -0,0 +1,9 @@ +use regex::Regex; + +pub fn header_filenames(header: &str) -> (String, String){ + lazy_static! { + static ref HEADER_GIT_FILENAME: Regex = Regex::new(r"^diff --git a/([^ ]+) b/([^ ]+)$").unwrap(); + } + let groups = HEADER_GIT_FILENAME.captures(header).unwrap(); + (groups.get(1).unwrap().as_str().to_string(), groups.get(2).unwrap().as_str().to_string()) +} diff --git a/src/diff/mod.rs b/src/diff/mod.rs index 0e51f5f..990aedc 100644 --- a/src/diff/mod.rs +++ b/src/diff/mod.rs @@ -6,6 +6,9 @@ use std::path::Path; mod diff_type; use self::diff_type::{DiffType, DiffFormatTyper}; +mod diff_parse; +use self::diff_parse::header_filenames; + /// Given a file, return statistical information about the file as if it were a diff file. /// Returns Some Result if the file could be read and None if the reading failed. /// @@ -34,6 +37,11 @@ pub fn diffStats(file: &Path) -> Option { for line in lines { match diff_format_typer.type_line(line) { None => break, + Some(DiffType::Header) => { + let (file1, file2) = header_filenames(line); + retVal.add_filename(file1); + retVal.add_filename(file2); + } Some(DiffType::NewRegion) => retVal.add_region(), Some(DiffType::Addition) => retVal.count_added_line(), Some(DiffType::Subtraction) => retVal.count_removed_line(), diff --git a/src/main.rs b/src/main.rs index 7bf0632..2342b9f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,4 +1,7 @@ #![allow(non_snake_case)] +extern crate regex; +#[macro_use] extern crate lazy_static; + use std::time::Instant; use std::path::Path; diff --git a/src/result.rs b/src/result.rs index 6d647e3..50df3b2 100644 --- a/src/result.rs +++ b/src/result.rs @@ -38,6 +38,10 @@ impl Result { } } + pub fn add_filename(&mut self, filename: String) { + self.files.insert(filename); + } + pub fn add_region(&mut self) { self.regions += 1 } From ab40313c91b55c74bba4e499c7587869ee53b0d8 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 19:16:24 -0400 Subject: [PATCH 13/17] Adding count of function calls The way this is done is throug regex of a single line to find A bunch of word characters followed by an open bracket. --- src/diff/diff_parse.rs | 13 ++++++++++++- src/diff/diff_type.rs | 10 ++++++++++ src/diff/mod.rs | 10 ++++++++-- src/result.rs | 5 +++++ 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/diff/diff_parse.rs b/src/diff/diff_parse.rs index 92cac05..8ee5ae3 100644 --- a/src/diff/diff_parse.rs +++ b/src/diff/diff_parse.rs @@ -1,9 +1,20 @@ +use result::Result; use regex::Regex; -pub fn header_filenames(header: &str) -> (String, String){ +pub fn header_filenames(header: &str) -> (String, String) { lazy_static! { static ref HEADER_GIT_FILENAME: Regex = Regex::new(r"^diff --git a/([^ ]+) b/([^ ]+)$").unwrap(); } let groups = HEADER_GIT_FILENAME.captures(header).unwrap(); (groups.get(1).unwrap().as_str().to_string(), groups.get(2).unwrap().as_str().to_string()) } + +pub fn find_functions(string: &str, result: &mut Result) { + lazy_static! { + static ref FUNCTION_REGEX: Regex = Regex::new(r"\w+\(").unwrap(); + } + FUNCTION_REGEX + .find_iter(string) + .map(|function| format!("{})", function.as_str())) + .for_each(|string| result.add_function_call(string)) +} diff --git a/src/diff/diff_type.rs b/src/diff/diff_type.rs index d444579..81d90e7 100644 --- a/src/diff/diff_type.rs +++ b/src/diff/diff_type.rs @@ -12,6 +12,16 @@ pub enum DiffType { Subtraction, } +impl DiffType { + pub fn is_file_body(&self) -> bool { + match self { + DiffType::FileLine | DiffType::Addition | DiffType::Subtraction | + DiffType::NewRegion => true, + _ => false, + } + } +} + /// Returns the diff status of a line. There are a few states, including the different header /// lines, normal file lines, additions, and subtractions. /// diff --git a/src/diff/mod.rs b/src/diff/mod.rs index 990aedc..dda5928 100644 --- a/src/diff/mod.rs +++ b/src/diff/mod.rs @@ -7,7 +7,7 @@ mod diff_type; use self::diff_type::{DiffType, DiffFormatTyper}; mod diff_parse; -use self::diff_parse::header_filenames; +use self::diff_parse::{find_functions, header_filenames}; /// Given a file, return statistical information about the file as if it were a diff file. /// Returns Some Result if the file could be read and None if the reading failed. @@ -35,7 +35,8 @@ pub fn diffStats(file: &Path) -> Option { let mut diff_format_typer = DiffFormatTyper::new(); let lines = file_data.lines(); for line in lines { - match diff_format_typer.type_line(line) { + let diff_type = diff_format_typer.type_line(line); + match diff_type { None => break, Some(DiffType::Header) => { let (file1, file2) = header_filenames(line); @@ -47,6 +48,11 @@ pub fn diffStats(file: &Path) -> Option { Some(DiffType::Subtraction) => retVal.count_removed_line(), _ => {} }; + if let Some(diff_type) = diff_type { + if diff_type.is_file_body() { + find_functions(line, &mut retVal); + } + } } Some(retVal) diff --git a/src/result.rs b/src/result.rs index 50df3b2..cca3721 100644 --- a/src/result.rs +++ b/src/result.rs @@ -42,6 +42,11 @@ impl Result { self.files.insert(filename); } + pub fn add_function_call(&mut self, function: String) { + let current = *self.functionCalls.get(&function).unwrap_or(&0); + self.functionCalls.insert(function,current + 1); + } + pub fn add_region(&mut self) { self.regions += 1 } From 20f83472b3ac4406b101f57d722cc8cf86ac3133 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 19:18:24 -0400 Subject: [PATCH 14/17] Improvment of main function. Printing takes time. Printing a lot takes longer. This is a split of the print time from the compute time. --- src/main.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/main.rs b/src/main.rs index 2342b9f..f0a096a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -33,9 +33,11 @@ fn timeTrack(start: Instant, string: &'static str) { fn main() { let now = Instant::now(); - println!("{}", compute().unwrap()); - + let compute = compute().unwrap(); timeTrack(now, "compute diff"); + let now = Instant::now(); + println!("{}", compute); + timeTrack(now, "compute print"); } // compute parses the git diffs in ./diffs and returns From b0b53c7689ac4fe90bb910cc2ca0ddb50739e6e3 Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 19:46:52 -0400 Subject: [PATCH 15/17] Giving the documentation treetment to src/diff/diff_type.rs --- src/diff/diff_type.rs | 50 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/diff/diff_type.rs b/src/diff/diff_type.rs index 81d90e7..07f523a 100644 --- a/src/diff/diff_type.rs +++ b/src/diff/diff_type.rs @@ -1,18 +1,41 @@ +/// Represents a kind of diff line. +/// The diff is split into 2 different parts: +/// Header information +/// File information +/// The problem is that this information is not split clearly. The header is not constant and may +/// lack any part of the header or body. +/// +/// In this enum, all posible types of lines are sorted. This is important even if the line is not +/// needed now, because if data is needed from a line later, we have the infromation labeled. #[derive(Debug, Clone)] pub enum DiffType { + /// The begining of a diff of the form: 'diff --git a/path/to/file b/path/tofile' Header, + /// Index information about a file: 'index 0123456..789abcd 100644' Index, + /// The path to the original file, the first file in the header: '--- a/path/to/file' OriginalFile, + /// The path to the new file, the second file in the header: '+++ b/path/to/file' NewFile, + /// Optional header to indicate that a files mode differs changed: 'new file mode 100644' NewMode, + /// The start of a file region: '@@ -888,12 +1002,33 @@ part of the file here' NewRegion, + /// The indication that the file was deleted: 'deleted file mode 100644' FileDeleted, + /// A line of a file which is the same between the given files: ' always with a space' FileLine, + /// A line that exists in the new file, but not the original: '+the line of the file' Addition, + /// A line that exists in the original file, but not the new: '-the line of the file' Subtraction, } + impl DiffType { + /// Returns if a type is part of the file body, or has file body parts to it. + /// This is useful as the slowest part of the process would be parcing the entire file for + /// functions. pub fn is_file_body(&self) -> bool { match self { DiffType::FileLine | DiffType::Addition | DiffType::Subtraction | @@ -57,15 +80,40 @@ fn diff_type(line: &str) -> DiffType { } } +/// Stateful Diff reading to avoid confusion over header and body. +/// Place all stateful information here. +/// +/// TODO: If you would want to know how many functions came from a given file, +/// putting a current file calculation here would be how to do it. pub struct DiffFormatTyper { last: Option, } impl DiffFormatTyper { + /// A new reader that should be started at the top of the file, however, it can start at any + /// header line. pub fn new() -> Self { DiffFormatTyper { last: None } } + /// A state machine that uses the guess of diff_type to create the actual type of a line and to + /// fail when it is not sure. + /// + /// Many just check that this line is expected and return it. Some however take the guess, do + /// a string compare to the minimum degree required and return the updated value. + /// + /// # Returns + /// + /// A value of Some(data) is considered a correct responce from the function. + /// None is the universal error. We also reset the state machine on error, and will continue + /// to output correct responces at the next header. + /// + /// A line to STDERR is sent indicating the error based on the string given and the current + /// state of the machine. + /// + /// # Note + /// + /// This section is messy and should be replaced with macros. pub fn type_line(&mut self, line: &str) -> Option { let diff_type = diff_type(line); let diff_type_fixed = match self.last.clone() { @@ -149,7 +197,7 @@ impl DiffFormatTyper { } }; if let None = diff_type_fixed { - eprintln!("Error: Diff file is not formated correctly. Line {} after {:?} was unexpected.", + eprintln!("Error: Diff file is not formated correctly. Line \"{}\" after {:?} was unexpected.", line, self.last); } From 219b448d39b56dbd3f637214f72fe1406aa5516b Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 20:01:30 -0400 Subject: [PATCH 16/17] giving src/diff/diff_parse.rs the documentation treatment --- src/diff/diff_parse.rs | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/diff/diff_parse.rs b/src/diff/diff_parse.rs index 8ee5ae3..e11b21c 100644 --- a/src/diff/diff_parse.rs +++ b/src/diff/diff_parse.rs @@ -1,6 +1,12 @@ use result::Result; use regex::Regex; +/// Returns the two filenames present in the diff header. +/// We are assuming --git, and this will ensure it as any non diff header will crash. +/// This is important as otherwise, we would be storing each filename twice: once for a path and +/// once for b path. +/// +/// This implementation assumes a path is any collection of characters that is not a space. pub fn header_filenames(header: &str) -> (String, String) { lazy_static! { static ref HEADER_GIT_FILENAME: Regex = Regex::new(r"^diff --git a/([^ ]+) b/([^ ]+)$").unwrap(); @@ -9,6 +15,29 @@ pub fn header_filenames(header: &str) -> (String, String) { (groups.get(1).unwrap().as_str().to_string(), groups.get(2).unwrap().as_str().to_string()) } +/// Finds and adds all functions to a given result structure. +/// This is the slowest part of the diff stats process as we have to iterate through a utf8 string +/// doing constant comparisons. +/// For the purposes of this chalange, I am defining a function given the regex bellow. A number +/// of word characters followed imidiatly by an open bracket. The issue with this structure is +/// that it is unlikely to be correct. Languages like C can have as much whitespace as wanted +/// between the name of the function and the open bracket. Likewise, the string 'for(' should not +/// be a function identifier, however this regex would. +/// +/// All in all, this is the best that can be done in a short time as the alternetive would be to +/// actually understand whatever language we are sifting though and then find funtions as defined +/// by the language itself. +/// +/// # Why are we passing in a result structure? +/// +/// This function has the beautiful disadvantage of being a variable size responce. +/// We don't know how many bytes any return of find_functions will take as it may return 0 or 5. +/// We however could return a Results structure: Initializing say 10_000 result structures, with +/// 10_000 empty sets for filenames is fairly useless. +/// Returning HashSet: We could return just the part of the results we wanted, the hashset and +/// create a function to add them together, or to open the underlying structure of the result. +/// I went with passing the result structure in directly. This is something I feel needs to be +/// fixed, however many other options seem unoptimal. pub fn find_functions(string: &str, result: &mut Result) { lazy_static! { static ref FUNCTION_REGEX: Regex = Regex::new(r"\w+\(").unwrap(); From b02c014e953a9f95a8ece3280e97689c4a54a77c Mon Sep 17 00:00:00 2001 From: Tormyst Date: Wed, 5 Dec 2018 20:14:52 -0400 Subject: [PATCH 17/17] Updating readme to say a few more things about me :) --- README.md | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 72c91f4..f12fb50 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,29 @@ While I am new to rust, my major Rust side project [FeGaBo](github.com/tormyst/F Using rust has interesting benefits. While possible to make efficient solutions in other languages, the advantages that rust offers pushes a lot of work onto the compiler. Ensuring that a task can multithreaded is simple in rust as if the operation is not memory safe, Rust will not compile. Casts and mutability are brought forward ensuring only what needs to be done is. +While this solution did not use threads, they could be added relativly easaly to have each file be processed individualy. + ## Building this project -This is a `cargo` project running on nightly rust. +This is a `cargo` project tested on stable rust. + +Once everything installed, best done through rustup (which can be installed through most package managers), you can use `cargo run` to run the solution + +For speed, try compiling under release: `cargo run --release` + +## Anything else? + +A check through of A trainee applicant must: -Once everything installed, you can use `cargo run` to run the solution +- Be engaged in a computer science (or related) university program. (Compleated with masters) +- Be able to work in Canada legally. (Yes) +- Be willing to come to Montreal. (Already here) +- Be able to read, understand and implement scientific papers. (Did that during my masters) +- Know: + - versionning systems (git, perforce, ...) (I know both of those also svn and something called accurev) + - c/c++/csharp or java (c, c++,csharp and java) +- Know or be willing to learn: + - golang (It's next on my list, just wanted to get through this first) + - docker (I know a bit about how they work and have used it several times, but not in depth) + - sql (A few variants) + - angular (I am no designer, but I have made some fun things in angular, but not an expert)