Skip to content

Commit

Permalink
Merge pull request #6 from rmg/skip-all-the-things
Browse files Browse the repository at this point in the history
Add basic skip optimization to all versions
  • Loading branch information
rmg authored Jun 30, 2023
2 parents adb7dfc + 1298bf8 commit 3c0cbdc
Show file tree
Hide file tree
Showing 9 changed files with 161 additions and 110 deletions.
65 changes: 31 additions & 34 deletions .github/workflows/makefile.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,40 +20,37 @@ jobs:
run: make raw.tar
- name: build & install
run: |
sudo apt-get update
sudo apt-get install ripgrep
sudo apt-get update -q
sudo apt-get install -yq ripgrep
make scan-c scan-go scan-rs
{
printf "## Versions\n"
grep -V
rg -V
node --version
rustc --version
go version
cc --version
} >> "$GITHUB_STEP_SUMMARY"
- name: make c.txt
run: |
printf '### c\n' >> "$GITHUB_STEP_SUMMARY"
make c.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
- name: make go.txt
run: |
printf '### go\n' >> "$GITHUB_STEP_SUMMARY"
make go.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
- name: make rs.txt
run: |
printf '### rs\n' >> "$GITHUB_STEP_SUMMARY"
make rs.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
- name: make grep.txt
- run: make c.txt
- run: make go.txt
- run: make rs.txt
- run: make grep.txt
if: github.event_name != 'pull_request'
- run: make ripgrep.txt
- run: make js.txt
- name: summarize
run: |
printf '### grep\n' >> "$GITHUB_STEP_SUMMARY"
make grep.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
- name: make ripgrep.txt
run: |
printf '### ripgrep\n' >> "$GITHUB_STEP_SUMMARY"
make ripgrep.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
- name: make js.txt
run: |
printf '### js\n' >> "$GITHUB_STEP_SUMMARY"
make js.txt TIME_REPORT=$GITHUB_STEP_SUMMARY
make times.md
{
printf "## Best Times\n\n"
cat times.md
printf "\n## Versions\n"
printf "grep: "
grep -V | head -n 1
printf "ripgrep: "
rg -V | head -n 1
printf "node: "
node --version | head -n 1
printf "rustc: "
rustc --version | head -n 1
printf "go: "
go version | head -n 1
printf "cc: "
cc --version | head -n 1
# Single quotes are required here: inside double quotes the shell treats
# backticks as command substitution, which would swallow the fence and the
# `ls` line below instead of printing them.
printf '\n## Sample Data\n```\n'
ls -l raw.tar
printf '```\n'
} >> "$GITHUB_STEP_SUMMARY"
15 changes: 10 additions & 5 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,7 @@ CFLAGS := -Werror -Wall
FAST_CFLAGS := $(CFLAGS) -O3 -DNDEBUG
DEBUG_CFLAGS := $(CFLAGS) -g
SIMD_CFLAGS := $(CFLAGS) -DUSE_SIMD=1
TIME_CMD := command time -p

ifdef TIME_REPORT
TIME_CMD := $(TIME_CMD) -a -o $(TIME_REPORT)
endif
TIME_CMD = command time -p -a -o $@.times

hexgrep scan-c-fast: main.c
$(CC) $(FAST_CFLAGS) -o $@ $<
Expand Down Expand Up @@ -73,6 +69,15 @@ scan-go: main.go
scan-rs: main.rs
rustc -O -o $@ $<

# times.md: write a Markdown table with one row per *.txt.times file found in
# the current directory. Each row holds the file name with its .txt.times
# suffix removed, followed by whatever times.awk prints for that file, placed
# under the real / user / system column headings. The whole brace group is
# redirected into the target file ($@).
# NOTE(review): no prerequisites are listed, so an existing times.md would be
# considered up to date unless it is declared .PHONY elsewhere — confirm.
times.md:
{ \
printf '\n| | real | user | system |\n'; \
printf '|-----|-------|-------|--------|\n'; \
for t in *.txt.times; do \
printf '| %3s | %5s | %5s | %6s |\n' $$(basename -s .txt.times $$t) $$(awk -f times.awk $$t); \
done; \
} > $@

%.txt: scan-% $(SAMPLE)
$(TIME_CMD) ./scan-$* < $(SAMPLE) > $@
$(TIME_CMD) ./scan-$* < $(SAMPLE) > $@
Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,11 @@ implementations compare to each other.
| grep | 0m18.034s | 0m15.713s | 0m2.257s |
| ripgrep | 0m1.709s | 0m1.541s | 0m0.147s |
| simple (Go) | 0m1.737s | 0m1.594s | 0m0.142s |
| skip (Go) | 0m0.338s | 0m0.187s | 0m0.152s |
| simple (Rust) | 0m1.461s | 0m1.325s | 0m0.131s |
| skip (Rust) | 0m0.231s | 0m0.105s | 0m0.124s |
| simple (Node) | 0m6.458s | 0m6.043s | 0m0.627s |
| skip (Node) | 0m1.368s | 0m1.062s | 0m0.686s |
| custom (C) | **0m0.222s** | **0m0.079s** | **0m0.141s** |

By comparing the times you can see that each implementation is more or less
Expand Down
3 changes: 3 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
module main

go 1.20
3 changes: 2 additions & 1 deletion main.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

//go:build ignore
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
Expand Down Expand Up @@ -226,7 +227,7 @@ static const unsigned char * scan_hit_short(const unsigned char *buf, const unsi
// Rather than checking them in linear order, we use statistics to determine
// the optimal order to check for an early exit.
// TODO: accept this ordering as input
// TOOD: extra credit, generate counts from first block
// TODO: extra credit, generate counts from first block
const int checks[39] = {
5,
32,
Expand Down
42 changes: 10 additions & 32 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,24 @@ limitations under the License.
package main

import (
"fmt"
"io"
"log"
"os"
"runtime"
"runtime/pprof"
)

const BUF = 64 * 4096

func searchWrite(buf []byte, out io.Writer) int {
count := 0
for i, b := range buf {
bl := len(buf)
for i := 0; i < bl; i++ {
b := buf[i]
if count == 0 && i+20 < bl {
bs := buf[i+20]
if !(bs >= '0' && bs <= '9') && !(bs >= 'a' && bs <= 'f') {
i += 20
continue
}
}
switch {
case b >= '0' && b <= '9':
fallthrough
Expand Down Expand Up @@ -77,30 +82,3 @@ func scan(input io.Reader) {
// main runs scan over standard input.
func main() {
scan(os.Stdin)
}

// pmain is a profiling variant of main. It runs scan over standard input
// while recording a CPU profile to sha1scan.cpu.prof, then writes every
// profile returned by pprof.Profiles to sha1scan.<name>.prof. Any failure to
// create or write a profile terminates the process via log.Fatal.
func pmain() {
	c, err := os.Create("sha1scan.cpu.prof")
	if err != nil {
		log.Fatal("could not create CPU profile: ", err)
	}
	defer c.Close()
	if err := pprof.StartCPUProfile(c); err != nil {
		log.Fatal("could not start CPU profile: ", err)
	}

	scan(os.Stdin)

	pprof.StopCPUProfile()
	runtime.GC() // get up-to-date statistics

	for _, p := range pprof.Profiles() {
		writeProfile(p)
	}
}

// writeProfile dumps a single profile to sha1scan.<name>.prof. It is a
// separate function so the deferred Close runs once per profile rather than
// piling up until pmain returns.
func writeProfile(p *pprof.Profile) {
	m, err := os.Create(fmt.Sprintf("sha1scan.%s.prof", p.Name()))
	if err != nil {
		// Fatalf needs a verb for every argument; err was previously dropped
		// into the output as an "%!(EXTRA ...)" artifact.
		log.Fatalf("could not create %s profile: %v", p.Name(), err)
	}
	defer m.Close()
	if err := p.WriteTo(m, 1); err != nil {
		// Fatalf, not Fatal: the message contains a format verb.
		log.Fatalf("could not write %s profile: %v", p.Name(), err)
	}
}
4 changes: 4 additions & 0 deletions main.js
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ function is_hex(c) {
function scan_slice(buf) {
let count = 0;
for (let i = 0; i< buf.length; i++) {
if (count === 0 && i+20 < buf.length && !is_hex(buf[i+20])) {
i += 20;
continue;
}
if (is_hex(buf[i])) {
count++;
continue;
Expand Down
97 changes: 59 additions & 38 deletions main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,53 +14,74 @@ See the License for the specific language governing permissions and
limitations under the License.
*/

use std::io::{Read, Write,stdin, stdout};
use std::str;
use std::io::{stdin, stdout, Read, Write};

/// Prints one match followed by a newline on standard output.
///
/// Stdout is locked once so both writes go through the same handle, and
/// `write_all` is used so a short write cannot truncate a match. A write
/// failure panics.
fn hit(needle: &[u8]) {
    let mut out = stdout().lock();
    out.write_all(needle).unwrap();
    out.write_all(b"\n").unwrap();
}

fn scan_slice(inb: &[u8]) -> usize {
let mut count = 0;
let len = inb.len();
for (i, &b) in inb.into_iter().enumerate() {
if b.is_ascii_digit() || (b'a'..=b'f').contains(&b) {
count += 1;
continue
}
if count == 40 {
hit(&inb[i-40..i]);
}
count = 0
}
if count == 40 {
hit(&inb[len-40..]);
count = 0
}
if count > 40 { 41 } else { count }
let mut count = 0;
let len = inb.len();
let mut i = 0usize;
while i < len {
let b = inb[i];
if count == 0 && i + 20 < len {
let bs = inb[i + 20];
if !bs.is_ascii_digit() && !(b'a'..=b'f').contains(&bs) {
i += 20;
continue;
}
}
if b.is_ascii_digit() || (b'a'..=b'f').contains(&b) {
count += 1;
i += 1;
continue;
}
if count == 40 {
hit(&inb[i - 40..i]);
}
count = 0;
i += 1;
}
if count == 40 {
hit(&inb[len - 40..]);
count = 0
}
if count > 40 {
41
} else {
count
}
}

fn sscan(mut input: impl Read) {
let mut backbuf = vec![0u8; 64*4096];
let bbuf = backbuf.as_mut_slice();
// let mut bbuf = [0u8; 2*1024*1024];
let mut off = 0;
let mut total_read = 0;
while let Ok(n) = input.read(&mut bbuf[off..]) {
total_read += n;
if n == 0 {
break
}
off = scan_slice(&bbuf[..n]);
for i in 0..off {
bbuf[i] = bbuf[n-off+i];
}
}
eprintln!("Total bytes read: {}", total_read);
let mut backbuf = vec![0u8; 64 * 4096];
let bbuf = backbuf.as_mut_slice();
// let mut bbuf = [0u8; 2*1024*1024];
let mut off = 0;
let mut total_read = 0;
while let Ok(n) = input.read(&mut bbuf[off..]) {
total_read += n;
if n == 0 {
break;
}
off = scan_slice(&bbuf[..n]);
for i in 0..off {
bbuf[i] = bbuf[n - off + i];
}
}
eprintln!("Total bytes read: {}", total_read);
}

/// Entry point: with exactly one command-line argument, scans that file;
/// otherwise scans standard input.
fn main() {
    let args: Vec<String> = std::env::args().collect();
    if args.len() == 2 {
        match std::fs::File::open(&args[1]) {
            Ok(file) => sscan(file),
            Err(err) => {
                // Report the failing path instead of panicking with a backtrace.
                eprintln!("{}: {}", args[1], err);
                std::process::exit(1);
            }
        }
    } else {
        sscan(stdin().lock())
    }
}
39 changes: 39 additions & 0 deletions times.awk
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Takes input with multiple run times (in POSIX time(1) format) and
# reduces them to a single line of the real, user and system times for
# fastest overall run.
#
# Example input:
# real 0.37
# user 0.06
# sys 0.15
# real 0.18
# user 0.05
# sys 0.12
# real 0.19
# user 0.05
# sys 0.13
#
# Output:
# 0.18 0.05 0.12

BEGIN {
    # Times belonging to the fastest run seen so far; "" means "not set yet".
    best_real = ""
    best_user = ""
    best_sys = ""
}

# A "real" line that beats the current best starts a new best run: remember it
# and clear user/sys so the lines that follow it are picked up.
/real/ {
    if (best_real == "" || $2 < best_real) {
        best_real = $2
        best_user = best_sys = ""
    }
}

# Only the first user/sys line after a new best is recorded; lines from
# slower runs find the slot already filled and are ignored.
/user/ && best_user == "" { best_user = $2 }
/sys/ && best_sys == "" { best_sys = $2 }

END {
    print best_real, best_user, best_sys
}

0 comments on commit 3c0cbdc

Please sign in to comment.