diff --git a/README.md b/README.md
index f007552..cfdc1c1 100644
--- a/README.md
+++ b/README.md
@@ -34,16 +34,22 @@ Ew, HTML. Let's run that through some pup selectors:
$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] attr{href}'
```
+Even better, let's grab the titles too:
+
+```bash
+$ curl -s https://news.ycombinator.com/ | pup 'td.title a[href^=http] json{}'
+```
+
## Basic Usage
```bash
-$ cat index.html | pup [selectors and flags]
+$ cat index.html | pup [flags] [selectors] [optional display function]
```
or
```bash
-$ pup < index.html [selectors and flags]
+$ pup < index.html [flags] [selectors] [optional display function]
```
## Examples
@@ -185,7 +191,7 @@ You can mix and match selectors as you wish.
cat index.html | pup element#id[attribute=value]
```
-## Functions
+## Display Functions
Non-HTML selectors which effect the output type are implemented as functions
which can be provided as a final argument.
@@ -231,6 +237,85 @@ $ pup < robots.html a attr{href} | head
//en.wikivoyage.org/wiki/
```
+#### `json{}`
+
+Print HTML as JSON.
+
+```bash
+$ cat robots.html | pup div#p-namespaces a
+
+ Article
+
+
+ Talk
+
+```
+
+```bash
+$ cat robots.html | pup div#p-namespaces a json{}
+[
+ {
+ "attrs": {
+ "accesskey": "c",
+ "href": "/wiki/Robots_exclusion_standard",
+ "title": "View the content page [c]"
+ },
+ "tag": "a",
+ "text": "Article"
+ },
+ {
+ "attrs": {
+ "accesskey": "t",
+ "href": "/wiki/Talk:Robots_exclusion_standard",
+ "title": "Discussion about the content page [t]"
+ },
+ "tag": "a",
+ "text": "Talk"
+ }
+]
+```
+
+Use the `-i` / `--indent` flag to control the indent level.
+
+```bash
+$ cat robots.html | pup --indent 4 div#p-namespaces a json{}
+[
+ {
+ "attrs": {
+ "accesskey": "c",
+ "href": "/wiki/Robots_exclusion_standard",
+ "title": "View the content page [c]"
+ },
+ "tag": "a",
+ "text": "Article"
+ },
+ {
+ "attrs": {
+ "accesskey": "t",
+ "href": "/wiki/Talk:Robots_exclusion_standard",
+ "title": "Discussion about the content page [t]"
+ },
+ "tag": "a",
+ "text": "Talk"
+ }
+]
+```
+
+If the selectors only return one element the results will be printed as a JSON
+object, not a list.
+
+```bash
+$ cat robots.html | pup --indent 4 title json{}
+{
+ "tag": "title",
+ "text": "Robots exclusion standard - Wikipedia, the free encyclopedia"
+}
+```
+
+Because there is no universal standard for converting HTML/XML to JSON, a
+method has been chosen which hopefully fits. The goal is simply to get the
+output of pup into a more consumable format.
+
## Flags
```bash
@@ -243,6 +328,6 @@ $ pup < robots.html a attr{href} | head
--version display version
```
-## TODO:
+## TODO
-* Print as json function `json{}`
+Add more tests!
diff --git a/display.go b/display.go
new file mode 100644
index 0000000..0e0d80a
--- /dev/null
+++ b/display.go
@@ -0,0 +1,130 @@
+package main
+
+import (
+ "encoding/json"
+ "fmt"
+ "regexp"
+ "strings"
+
+ "code.google.com/p/go.net/html"
+)
+
+type Displayer interface { // Displayer is implemented by TextDisplayer, AttrDisplayer and JSONDisplayer.
+ Display(nodes []*html.Node) // Display handles the given nodes; each implementation in this file prints them to stdout.
+}
+
+// TextDisplayer prints the text content of the selected nodes.
+type TextDisplayer struct{}
+
+// Display prints the data of every text node found in nodes and in their
+// descendants, one per line, walking each subtree depth-first.
+func (t TextDisplayer) Display(nodes []*html.Node) {
+	for _, n := range nodes {
+		if n.Type == html.TextNode {
+			fmt.Println(n.Data)
+		}
+		kids := []*html.Node{}
+		for c := n.FirstChild; c != nil; c = c.NextSibling {
+			kids = append(kids, c)
+		}
+		t.Display(kids)
+	}
+}
+
+// AttrDisplayer prints the value of a single named attribute.
+type AttrDisplayer struct {
+	Attr string // key of the attribute to print
+}
+
+// Display prints, one per line and HTML-escaped, each node's a.Attr values.
+func (a AttrDisplayer) Display(nodes []*html.Node) {
+	for _, n := range nodes {
+		for _, at := range n.Attr {
+			if at.Key == a.Attr {
+				fmt.Printf("%s\n", html.EscapeString(at.Val))
+			}
+		}
+	}
+}
+
+type JSONDisplayer struct {
+}
+
+// returns a jsonifiable struct
+func jsonify(node *html.Node) map[string]interface{} {
+ vals := map[string]interface{}{}
+ if len(node.Attr) > 0 {
+ attrs := map[string]string{}
+ for _, attr := range node.Attr {
+ attrs[attr.Key] = html.EscapeString(attr.Val)
+ }
+ vals["attrs"] = attrs
+ }
+ vals["tag"] = node.DataAtom.String()
+ children := []interface{}{}
+ for child := node.FirstChild; child != nil; child = child.NextSibling {
+ switch child.Type {
+ case html.ElementNode:
+ children = append(children, jsonify(child))
+ case html.TextNode:
+ text := strings.TrimSpace(child.Data)
+ if text != "" {
+ // if there is already text we'll append it
+ currText, ok := vals["text"]
+ if ok {
+ text = fmt.Sprintf("%s %s", currText, text)
+ }
+ vals["text"] = text
+ }
+ }
+ }
+ return vals
+}
+
+// Display prints nodes as JSON indented with indentString: a lone node becomes
+// an object, any other count (including zero) a list. Panics if encoding fails.
+func (j JSONDisplayer) Display(nodes []*html.Node) {
+	var payload interface{}
+	if len(nodes) == 1 {
+		payload = jsonify(nodes[0])
+	} else {
+		list := []map[string]interface{}{}
+		for _, n := range nodes {
+			list = append(list, jsonify(n))
+		}
+		payload = list
+	}
+	out, err := json.MarshalIndent(payload, "", indentString)
+	if err != nil {
+		panic("Could not jsonify nodes")
+	}
+	fmt.Printf("%s\n", out)
+}
+
+var (
+	// Matchers for a display function passed as the final argument.
+	displayMatcher  = regexp.MustCompile(`\{[^\}]*\}$`)        // any trailing "{...}"
+	textFuncMatcher = regexp.MustCompile(`^text\{\}$`)         // exactly "text{}"
+	attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`) // "attr{KEY}", capturing KEY
+	jsonFuncMatcher = regexp.MustCompile(`^json\{([^\}]*)\}$`) // "json{...}", capturing the braces' content
+)
+
+// NewDisplayFunc parses text as a display function: "text{}" gives a
+// TextDisplayer, "attr{KEY}" an AttrDisplayer for KEY, "json{...}" a
+// JSONDisplayer. Any other input returns a nil Displayer and an error.
+func NewDisplayFunc(text string) (Displayer, error) {
+	if !displayMatcher.MatchString(text) {
+		return nil, fmt.Errorf("Not a display function")
+	}
+	if textFuncMatcher.MatchString(text) {
+		return TextDisplayer{}, nil
+	}
+	// A successful match always holds the full text plus the captured key.
+	if m := attrFuncMatcher.FindStringSubmatch(text); m != nil {
+		return AttrDisplayer{Attr: m[1]}, nil
+	}
+	if jsonFuncMatcher.MatchString(text) {
+		return JSONDisplayer{}, nil
+	}
+	return nil, fmt.Errorf("Not a display function")
+}
diff --git a/funcs/display.go b/funcs/display.go
deleted file mode 100644
index f168425..0000000
--- a/funcs/display.go
+++ /dev/null
@@ -1,70 +0,0 @@
-package funcs
-
-import (
- "code.google.com/p/go.net/html"
- "fmt"
- "regexp"
-)
-
-type Displayer interface {
- Display(nodes []*html.Node)
-}
-
-type TextDisplayer struct {
-}
-
-func (t TextDisplayer) Display(nodes []*html.Node) {
- for _, node := range nodes {
- if node.Type == html.TextNode {
- fmt.Println(node.Data)
- }
- children := []*html.Node{}
- child := node.FirstChild
- for child != nil {
- children = append(children, child)
- child = child.NextSibling
- }
- t.Display(children)
- }
-}
-
-type AttrDisplayer struct {
- Attr string
-}
-
-func (a AttrDisplayer) Display(nodes []*html.Node) {
- for _, node := range nodes {
- attributes := node.Attr
- for _, attr := range attributes {
- if attr.Key == a.Attr {
- val := html.EscapeString(attr.Val)
- fmt.Printf("%s\n", val)
- }
- }
- }
-}
-
-var (
- // Display function helpers
- displayMatcher *regexp.Regexp = regexp.MustCompile(`\{[^\}]*\}$`)
- textFuncMatcher = regexp.MustCompile(`^text\{\}$`)
- attrFuncMatcher = regexp.MustCompile(`^attr\{([^\}]*)\}$`)
-)
-
-func NewDisplayFunc(text string) (Displayer, error) {
- if !displayMatcher.MatchString(text) {
- return nil, fmt.Errorf("Not a display function")
- }
- switch {
- case textFuncMatcher.MatchString(text):
- return TextDisplayer{}, nil
- case attrFuncMatcher.MatchString(text):
- matches := attrFuncMatcher.FindStringSubmatch(text)
- if len(matches) != 2 {
- return nil, fmt.Errorf("")
- } else {
- return AttrDisplayer{matches[1]}, nil
- }
- }
- return nil, fmt.Errorf("Not a display function")
-}
diff --git a/main.go b/main.go
index c37365c..f9a4769 100644
--- a/main.go
+++ b/main.go
@@ -4,7 +4,6 @@ import (
"code.google.com/p/go.net/html"
"code.google.com/p/go.net/html/charset"
"fmt"
- "github.com/ericchiang/pup/funcs"
"github.com/ericchiang/pup/selector"
"io"
"os"
@@ -12,7 +11,7 @@ import (
"strings"
)
-const VERSION string = "0.3.0"
+const VERSION string = "0.3.1"
var (
// Flags
@@ -22,7 +21,7 @@ var (
maxPrintLevel int = -1
printNumber bool = false
printColor bool = false
- displayer funcs.Displayer = nil
+ displayer Displayer = nil
)
// Print to stderr and exit
@@ -177,7 +176,7 @@ func main() {
// if this is the last element, check for a function like
// text{} or attr{}
if i+1 == len(cmds) {
- d, err := funcs.NewDisplayFunc(cmd)
+ d, err := NewDisplayFunc(cmd)
if err == nil {
displayer = d
selectors = selectors[0 : len(cmds)-1]