Skip to content

Commit

Permalink
wl: add min word length, min count
Browse files Browse the repository at this point in the history
  • Loading branch information
davemolk committed Oct 24, 2022
1 parent bdfd0ef commit 0505eb8
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 13 deletions.
51 changes: 51 additions & 0 deletions wl/data.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
package main

import (
"fmt"
"log"
"os"
"sort"
"sync"
)

// WordMap is a tally of how many times each word has been seen. Additions
// are serialized by a mutex so add may be called from several goroutines.
type WordMap struct {
	mu    sync.Mutex     // held by add while it updates words
	words map[string]int // word -> number of occurrences
}

// newWordMap returns an empty WordMap whose map is already allocated, so it
// can be written to immediately.
func newWordMap() *WordMap {
	return &WordMap{
		words: make(map[string]int),
	}
}

// add increments the occurrence count for w while holding the mutex.
func (wm *WordMap) add(w string) {
	wm.mu.Lock()
	defer wm.mu.Unlock()
	wm.words[w]++
}

// sort returns every recorded word ordered by descending count. Words that
// share a count are ordered alphabetically: map iteration order is random, so
// without this tie-break the result would differ from run to run.
//
// sort reads the map without taking the mutex, so it must only be called once
// all concurrent add calls have finished.
func (wm *WordMap) sort() []string {
	keys := make([]string, 0, len(wm.words))
	for key := range wm.words {
		keys = append(keys, key)
	}

	sort.Slice(keys, func(i, j int) bool {
		ci, cj := wm.words[keys[i]], wm.words[keys[j]]
		if ci != cj {
			return ci > cj
		}
		return keys[i] < keys[j]
	})

	return keys
}

// write creates (or truncates) the file called name and writes one
// "word: count" line for each entry of keys, in the order given. Any failure
// to create, write, or close the file terminates the program via log.Fatal.
func (wm *WordMap) write(keys []string, name string) {
	f, err := os.Create(name)
	if err != nil {
		log.Fatal(err)
	}

	for _, key := range keys {
		if _, err := fmt.Fprintf(f, "%s: %d\n", key, wm.words[key]); err != nil {
			// log.Fatal exits without running defers, so close explicitly first.
			f.Close()
			log.Fatal(err)
		}
	}

	// Close is checked rather than deferred: a failed close can mean the data
	// never reached the disk.
	if err := f.Close(); err != nil {
		log.Fatal(err)
	}
}
28 changes: 21 additions & 7 deletions wl/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,44 @@ package main

import (
"flag"
"fmt"
"log"
"regexp"
"sync"
)

type config struct {
timeout int
url string
minCount int
minLength int
timeout int
url string
}

// wl carries the state shared by the word-list methods: the embedded
// command-line configuration, a whitespace-collapsing pattern, and the word
// tally.
type wl struct {
// Embedded, so the flag values are reachable directly on wl.
config
// In main this is compiled from `\s{2,}` (two or more whitespace characters).
noBlank *regexp.Regexp
// Word counts; main populates it via add.
wordMap *WordMap
}

func main() {
var config config
flag.IntVar(&config.minCount, "c", 0, "minimum count to include word in results")
flag.IntVar(&config.minLength, "len", 0, "minimum word length to consider")
flag.IntVar(&config.timeout, "t", 5000, "request timeout (in ms)")
flag.StringVar(&config.url, "u", "", "url to search")
flag.Parse()

noBlank := regexp.MustCompile(`\s{2,}`)
wordMap := newWordMap()

w := &wl{
config: config,
config: config,
noBlank: noBlank,
wordMap: wordMap,
}

g, err := w.makeRequest(config.url, config.timeout)
if err != nil {
log.Fatal(err)
log.Fatal(err)
}

words := w.processData(g)
Expand All @@ -42,8 +49,15 @@ func main() {
go func(word string) {
defer wg.Done()
word = w.removePunctuation(word)
fmt.Println(word)
if len(word) >= config.minLength {
w.wordMap.add(word)
}
}(word)
}
wg.Wait()
}

keys := w.wordMap.sort()
keysCount := w.dropLowCount(keys)

w.wordMap.write(keysCount, "wl/results.txt")
}
19 changes: 16 additions & 3 deletions wl/process.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ func (w *wl) processData(doc *goquery.Document) []string {
body := doc.Text()
body = w.noBlank.ReplaceAllString(body, " ")
body = strings.Replace(body, "\n", "", -1)

return strings.Split(body, " ")
}

Expand All @@ -24,7 +24,7 @@ func (w *wl) removePunctuation(word string) string {
return strings.ToLower(word)
}

// prob add more
// add more?
func (w *wl) getPunctuation() []string {
return []string{
",",
Expand All @@ -33,5 +33,18 @@ func (w *wl) getPunctuation() []string {
";",
"!",
"?",
"—",
}
}
}

// dropLowCount returns the leading portion of keys, cut off at the first word
// whose count is below the configured minCount. Because it stops at the first
// such word, keys is expected to be ordered by descending count (main passes
// in the result of wordMap.sort). When minCount is zero or negative, keys is
// returned unchanged.
func (w *wl) dropLowCount(keys []string) []string {
	if w.config.minCount <= 0 {
		return keys
	}

	for idx := 0; idx < len(keys); idx++ {
		if w.wordMap.words[keys[idx]] < w.config.minCount {
			// Everything from here on is below the threshold; keep the prefix.
			return keys[:idx]
		}
	}
	return keys
}
6 changes: 3 additions & 3 deletions wl/requests.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import (
)

func (w *wl) makeRequest(url string, timeout int) (*goquery.Document, error) {
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(5000) * time.Millisecond)
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(5000)*time.Millisecond)
defer cancel()

req, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
Expand All @@ -22,7 +22,7 @@ func (w *wl) makeRequest(url string, timeout int) (*goquery.Document, error) {
return nil, err
}
defer resp.Body.Close()

if resp.StatusCode != 200 {
return nil, fmt.Errorf("statusCode: %d", resp.StatusCode)
}
Expand All @@ -33,4 +33,4 @@ func (w *wl) makeRequest(url string, timeout int) (*goquery.Document, error) {
}

return doc, nil
}
}

0 comments on commit 0505eb8

Please sign in to comment.