例子中调用golang.org/x/net/html的部分api如下所示。html.Parse函数读入一组bytes,解析后,返回html.Node类型的HTML页面树状结构根节点。HTML拥有很多类型的结点,如text(文本)、comments(注释)类型,在下面的例子中,我们只关注<name key='value'>形式的结点。
golang.org/x/net/html
packagehtmltypeNodestruct { Type NodeType Data string Attr []Attribute FirstChild, NextSibling *Node}typeNodeTypeint32const ( ErrorNode NodeType=iota TextNode DocumentNode ElementNode CommentNode DoctypeNode)typeAttributestruct { Key, Val string}funcParse(r io.Reader) (*Node, error)
// Findlinks1 prints the links in an HTML document read from standard input.packagemainimport ("fmt""os""golang.org/x/net/html")funcmain() { doc, err := html.Parse(os.Stdin)if err !=nil { fmt.Fprintf(os.Stderr, "findlinks1: %v\n", err) os.Exit(1) }for _, link :=rangevisit(nil, doc) { fmt.Println(link) }}
// visit appends to links each link found in n and returns the result.funcvisit(links []string, n *html.Node) []string {if n.Type == html.ElementNode && n.Data =="a" {for _, a :=range n.Attr {if a.Key =="href" { links =append(links, a.Val) } } }for c := n.FirstChild; c !=nil; c = c.NextSibling { links =visit(links, c) }return links}
下面的输出来自gopl.io/ch5/outline程序(其源码未在此处列出),它打印的是每个元素结点的标签栈,而不是findlinks1输出的链接:
$ go build gopl.io/ch5/outline
$ ./fetch https://golang.org | ./outline
[html]
[html head]
[html head meta]
[html head title]
[html head link]
[html body]
[html body div]
[html body div]
[html body div div]
[html body div div form]
[html body div div form div]
[html body div div form div a]
...