2019-10-03 23:59:20 +00:00
|
|
|
package linkinfo
|
2019-10-03 23:44:38 +00:00
|
|
|
|
|
|
|
import (
|
|
|
|
"errors"
|
|
|
|
"fmt"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
|
"io"
|
|
|
|
"io/ioutil"
|
2019-10-16 23:12:29 +00:00
|
|
|
"net"
|
2019-10-03 23:44:38 +00:00
|
|
|
"net/http"
|
|
|
|
"net/url"
|
|
|
|
"path"
|
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
)
|
|
|
|
|
|
|
|
// contentTypeHtml is the media type (parameters already stripped) that
// DefaultLinkHandler treats as an HTML page whose title can be scraped.
const contentTypeHtml = "text/html"

// maxBodySizeBytes is the largest HTML body, in bytes (20 MiB), that is
// downloaded and parsed when looking for a page title.
const maxBodySizeBytes = 20 * 1024 * 1024
|
|
|
|
|
2019-10-16 23:12:29 +00:00
|
|
|
// privateIPBlocks is the deny-list of networks that link lookups must never
// be sent to. It is filled once by init and only read afterwards.
var privateIPBlocks []*net.IPNet

// init parses the hard-coded CIDR list into privateIPBlocks. A parse failure
// means the literal list below is wrong (a programmer error), so it panics.
func init() {
	for _, cidr := range []string{
		"0.0.0.0/8",      // IPv4 "this network"
		"127.0.0.0/8",    // IPv4 loopback
		"10.0.0.0/8",     // RFC1918
		"172.16.0.0/12",  // RFC1918
		"192.168.0.0/16", // RFC1918
		"169.254.0.0/16", // IPv4 link-local (RFC3927), includes cloud metadata endpoints
		"::1/128",        // IPv6 loopback
		"fe80::/10",      // IPv6 link-local
		"fc00::/7",       // IPv6 unique local addr
	} {
		_, block, err := net.ParseCIDR(cidr)
		if err != nil {
			panic(fmt.Errorf("parse error on %q: %v", cidr, err))
		}
		privateIPBlocks = append(privateIPBlocks, block)
	}
}
|
|
|
|
|
|
|
|
func isPrivateIP(ip net.IP) bool {
|
|
|
|
for _, block := range privateIPBlocks {
|
|
|
|
if block.Contains(ip) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2019-10-04 01:25:57 +00:00
|
|
|
func (api *LinkInfoApi) DefaultLinkHandler(link string) (*LinkInfo, error) {
|
2019-10-03 23:44:38 +00:00
|
|
|
redirects := make([]string, 0)
|
|
|
|
|
|
|
|
u, err := url.Parse(link)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2019-10-16 23:12:29 +00:00
|
|
|
if u.Host == "localhost" {
|
|
|
|
return nil, errors.New("url is localhost")
|
|
|
|
} else if ip := net.ParseIP(u.Host); ip != nil && isPrivateIP(ip) {
|
|
|
|
return nil, errors.New("url is a local ip address")
|
|
|
|
}
|
|
|
|
|
2019-10-03 23:44:38 +00:00
|
|
|
var res *http.Response
|
|
|
|
|
|
|
|
for i := 0; i < 10; i++ {
|
2019-10-04 01:25:57 +00:00
|
|
|
res, err = api.Client.Head(link)
|
2019-10-03 23:44:38 +00:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if (res.StatusCode == 301 || res.StatusCode == 302) && res.Header.Get("Location") != "" {
|
|
|
|
link = res.Header.Get("Location")
|
2019-10-16 23:12:29 +00:00
|
|
|
|
|
|
|
redirectUrl, err := url.Parse(link)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if redirectUrl.Host == "localhost" {
|
|
|
|
return nil, errors.New("url attempted to redirect to localhost")
|
|
|
|
} else if ip := net.ParseIP(redirectUrl.Host); ip != nil && isPrivateIP(ip) {
|
|
|
|
return nil, errors.New("host is a local ip address")
|
|
|
|
}
|
|
|
|
|
2019-10-03 23:44:38 +00:00
|
|
|
redirects = append(redirects, link)
|
|
|
|
} else {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if res != nil && res.StatusCode != 200 {
|
|
|
|
return nil, errors.New("invalid response, expected 200, got " + strconv.Itoa(res.StatusCode))
|
|
|
|
}
|
|
|
|
|
|
|
|
contentType := res.Header.Get("Content-Type")
|
|
|
|
|
|
|
|
if contentType == "" {
|
2019-10-04 01:25:57 +00:00
|
|
|
contentType = api.detectContentType(link, "application/octet-stream")
|
2019-10-03 23:44:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if idx := strings.Index(contentType, ";"); idx != -1 {
|
|
|
|
contentType = contentType[:idx]
|
|
|
|
}
|
|
|
|
|
|
|
|
var contentLength int64
|
|
|
|
|
|
|
|
if contentLengthStr := res.Header.Get("Content-Length"); contentLengthStr != "" {
|
|
|
|
contentLength, err = strconv.ParseInt(contentLengthStr, 10, 64)
|
|
|
|
}
|
|
|
|
|
|
|
|
ret := &LinkInfo{
|
2019-10-03 23:59:20 +00:00
|
|
|
ContentType: contentType,
|
2019-10-03 23:44:38 +00:00
|
|
|
ContentLength: contentLength,
|
|
|
|
}
|
|
|
|
|
|
|
|
switch contentType {
|
2019-10-04 00:03:11 +00:00
|
|
|
case contentTypeHtml:
|
2019-10-06 04:52:26 +00:00
|
|
|
if contentLength >= 0 && contentLength < maxBodySizeBytes {
|
2019-10-04 01:25:57 +00:00
|
|
|
err = api.retrieveHtmlLinkTitle(ret, link)
|
2019-10-04 00:05:25 +00:00
|
|
|
break
|
|
|
|
}
|
|
|
|
fallthrough
|
2019-10-03 23:44:38 +00:00
|
|
|
default:
|
|
|
|
ret.Title = fmt.Sprintf("%s (%s, %s)", path.Base(u.Path), contentType, ByteCountDecimal(contentLength))
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret, err
|
|
|
|
}
|
|
|
|
|
2019-10-04 01:25:57 +00:00
|
|
|
func (api *LinkInfoApi) detectContentType(link, defaultType string) string {
|
2019-10-03 23:44:38 +00:00
|
|
|
req, err := http.NewRequest("GET", link, nil)
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return defaultType
|
|
|
|
}
|
|
|
|
|
|
|
|
req.Header.Set("Range", "bytes=0-512")
|
|
|
|
|
2019-10-04 01:25:57 +00:00
|
|
|
res, err := api.Client.Do(req)
|
2019-10-03 23:44:38 +00:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return defaultType
|
|
|
|
}
|
|
|
|
|
|
|
|
defer res.Body.Close()
|
|
|
|
|
|
|
|
b, err := ioutil.ReadAll(io.LimitReader(res.Body, 512))
|
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return defaultType
|
|
|
|
}
|
|
|
|
|
|
|
|
t := http.DetectContentType(b)
|
|
|
|
|
|
|
|
if t == "" {
|
|
|
|
t = defaultType
|
|
|
|
}
|
|
|
|
|
|
|
|
return t
|
|
|
|
}
|
|
|
|
|
|
|
|
// attrKeys lists, in lookup order, the <meta> attributes whose value is used
// as the tag's key when collecting metadata in retrieveHtmlLinkTitle.
var attrKeys = []string{
	"property",
	"name",
	"itemprop",
}
|
|
|
|
|
2019-10-04 01:25:57 +00:00
|
|
|
func (api *LinkInfoApi) retrieveHtmlLinkTitle(i *LinkInfo, link string) error {
|
|
|
|
res, err := api.Client.Get(link)
|
2019-10-03 23:44:38 +00:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
defer res.Body.Close()
|
|
|
|
|
2019-10-04 00:05:25 +00:00
|
|
|
q, err := goquery.NewDocumentFromReader(io.LimitReader(res.Body, maxBodySizeBytes))
|
2019-10-03 23:44:38 +00:00
|
|
|
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
meta := q.Find("meta")
|
|
|
|
|
|
|
|
metaTags := make(map[string]string)
|
|
|
|
|
|
|
|
meta.Each(func(_ int, s *goquery.Selection) {
|
2019-10-16 23:12:29 +00:00
|
|
|
var key, value string
|
2019-10-03 23:44:38 +00:00
|
|
|
var exists bool
|
|
|
|
|
|
|
|
for _, k := range attrKeys {
|
|
|
|
key, exists = s.Attr(k)
|
|
|
|
|
|
|
|
if exists {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if key == "" {
|
|
|
|
return
|
|
|
|
}
|
2019-10-16 23:12:29 +00:00
|
|
|
|
|
|
|
value, exists = s.Attr("content")
|
|
|
|
|
|
|
|
if !exists {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
metaTags[key] = value
|
2019-10-03 23:44:38 +00:00
|
|
|
})
|
|
|
|
|
|
|
|
var attr string
|
|
|
|
var exists bool
|
|
|
|
|
|
|
|
if attr, exists = metaTags["og:title"]; exists {
|
|
|
|
i.Title = attr
|
2019-10-16 23:12:29 +00:00
|
|
|
} else if tag := q.Find("head > title"); tag.Length() > 0 {
|
2019-10-04 00:02:33 +00:00
|
|
|
i.Title = strings.TrimSpace(tag.Text())
|
2019-10-03 23:44:38 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if attr, exists = metaTags["og:description"]; exists {
|
|
|
|
i.Description = attr
|
|
|
|
} else if attr, exists = metaTags["description"]; exists {
|
|
|
|
i.Description = attr
|
|
|
|
}
|
|
|
|
|
|
|
|
if attr, exists = metaTags["duration"]; exists {
|
|
|
|
i.Duration = attr
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
2019-10-03 23:59:20 +00:00
|
|
|
}
|