-
Notifications
You must be signed in to change notification settings - Fork 53
/
Copy pathshtml.go
242 lines (225 loc) · 6.71 KB
/
shtml.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// WRP TXT / Simple HTML Mode Routines
package main
// TODO:
// - add image processing times counter to the footer
// - img cache w/garbage collector / test back/button behavior in old browsers
// - add referer header
// - svg support
// - incorrect cert support in both markdown and image download
// - unify cdp and txt image handlers
// - use goroutines to process images
// - get inner html from chromedp instead of html2markdown
//
// - BUG: DomainFromURL always prefixes with http instead of https
// reproduces on vsi vms docs
// - BUG: markdown table errors
// reproduces on hacker news
// - BUG: captcha errors using html to markdown, perhaps use cdp inner html + downloaded images
// reproduces on https://www.cnn.com/cnn-underscored/electronics
import (
"bytes"
"encoding/base64"
"errors"
"fmt"
"image"
"image/gif"
"image/jpeg"
"image/png"
"io"
"log"
"net/http"
"strconv"
"strings"
"sync"
"time"
h2m "github.com/JohannesKaufmann/html-to-markdown"
"github.com/JohannesKaufmann/html-to-markdown/plugin"
"github.com/lithammer/shortuuid/v4"
"github.com/nfnt/resize"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/extension"
"github.com/yuin/goldmark/parser"
"github.com/yuin/goldmark/text"
"github.com/yuin/goldmark/util"
"golang.org/x/image/webp"
)
// imgZpfx is the URL path prefix used for images held in imgStor.
const imgZpfx = "/imgz/"

// imgStor is the package-wide in-memory image store. Its map is created
// here, at package initialization, so it is ready before any handler runs.
var imgStor = imageStore{img: make(map[string]imageContainer)}
// imageContainer is one entry of an imageStore.
type imageContainer struct {
	data  []byte    // image bytes as passed to add
	url   string    // source URL as passed to add
	added time.Time // time at which the entry was stored
}

// imageStore is an in-memory collection of images keyed by ID. All methods
// take the embedded mutex, so a single store may be shared between
// goroutines. The zero value is ready to use.
type imageStore struct {
	img map[string]imageContainer
	sync.Mutex
}

// add stores img under id, replacing any existing entry with the same id,
// and records url and the current time alongside it.
func (i *imageStore) add(id, url string, img []byte) {
	i.Lock()
	defer i.Unlock()
	// Writing to a nil map panics; create it lazily so a zero-value store
	// works without relying on external initialization.
	if i.img == nil {
		i.img = make(map[string]imageContainer)
	}
	i.img[id] = imageContainer{data: img, url: url, added: time.Now()}
}

// get returns the image bytes stored under id, or a "not found" error when
// there is no such entry.
func (i *imageStore) get(id string) ([]byte, error) {
	i.Lock()
	defer i.Unlock()
	img, ok := i.img[id]
	if !ok {
		return nil, errors.New("not found")
	}
	return img.data, nil
}

// del removes the entry stored under id. It is a no-op when id is unknown.
func (i *imageStore) del(id string) {
	i.Lock()
	defer i.Unlock()
	delete(i.img, id)
}
// fetchImage obtains the image referenced by url, scales and re-encodes it
// with smallImg, and stores the result in imgStor under id.
//
// url is either an HTTP(S) URL, which is downloaded with http.Get, or a
// "data" URL whose payload (everything after the first comma) is decoded as
// standard base64. Any other URL is rejected with an error. imgType, maxSize
// and imgOpt are passed to smallImg unchanged.
//
// On success it returns the size in bytes of the stored, re-encoded image.
func fetchImage(id, url, imgType string, maxSize, imgOpt int) (int, error) {
	log.Printf("Downloading IMGZ URL=%q for ID=%q", url, id)
	var in []byte
	// Prefix tests instead of url[:4] so short or empty URLs cannot panic.
	switch {
	case strings.HasPrefix(url, "http"):
		r, err := http.Get(url) // TODO: possibly set a header "referer" here
		if err != nil {
			return 0, fmt.Errorf("Error downloading %q: %w", url, err)
		}
		// Deferred before the status check so the body is closed on every
		// return path, including non-200 responses.
		defer r.Body.Close()
		if r.StatusCode != http.StatusOK {
			return 0, fmt.Errorf("Error %q HTTP Status Code: %v", url, r.StatusCode)
		}
		in, err = io.ReadAll(r.Body)
		if err != nil {
			return 0, fmt.Errorf("Error reading %q: %w", url, err)
		}
	case strings.HasPrefix(url, "data"):
		idx := strings.Index(url, ",")
		if idx < 1 {
			return 0, fmt.Errorf("image is embeded but unable to find coma: %q", url)
		}
		var err error
		in, err = base64.StdEncoding.DecodeString(url[idx+1:])
		if err != nil {
			return 0, fmt.Errorf("error decoding image from url embed: %q: %w", url, err)
		}
	default:
		// Previously fell through with no data and failed later with a
		// misleading decode error.
		return 0, fmt.Errorf("unsupported image url: %q", url)
	}
	out, err := smallImg(in, imgType, maxSize, imgOpt)
	if err != nil {
		return 0, fmt.Errorf("Error scaling down image: %w", err)
	}
	imgStor.add(id, url, out)
	return len(out), nil
}
// smallImg decodes src, scales it down and re-encodes it as imgType.
//
// The input format is detected with http.DetectContentType; PNG, GIF, JPEG
// and WebP are decoded, anything else is an error. The decoded image is
// passed through resize.Thumbnail with maxSize as both the maximum width and
// the maximum height, using nearest-neighbor interpolation.
//
// imgType selects the output encoder:
//   - "png": imgOpt is unused
//   - "gif": the image is first converted with gifPalette(img, imgOpt)
//   - "jpg": imgOpt is the JPEG quality
//
// Any other imgType is an error. It returns the encoded bytes.
func smallImg(src []byte, imgType string, maxSize, imgOpt int) ([]byte, error) {
	t := http.DetectContentType(src)
	var err error
	var img image.Image
	switch t {
	case "image/png":
		img, err = png.Decode(bytes.NewReader(src))
	case "image/gif":
		img, err = gif.Decode(bytes.NewReader(src))
	case "image/jpeg":
		img, err = jpeg.Decode(bytes.NewReader(src))
	case "image/webp":
		img, err = webp.Decode(bytes.NewReader(src))
	default: // TODO: also add svg
		err = errors.New("unknown content type: " + t)
	}
	if err != nil {
		return nil, fmt.Errorf("image decode problem: %w", err)
	}
	img = resize.Thumbnail(uint(maxSize), uint(maxSize), img, resize.NearestNeighbor)
	var outBuf bytes.Buffer
	switch imgType {
	case "png":
		err = png.Encode(&outBuf, img)
	case "gif":
		err = gif.Encode(&outBuf, gifPalette(img, int64(imgOpt)), &gif.Options{})
	case "jpg":
		err = jpeg.Encode(&outBuf, img, &jpeg.Options{Quality: imgOpt})
	default:
		// Without this an unknown type yielded an empty image and no error.
		err = errors.New("unknown output image type: " + imgType)
	}
	if err != nil {
		// Applies to every encoder above, not only GIF.
		return nil, fmt.Errorf("image encode problem: %w", err)
	}
	return outBuf.Bytes(), nil
}
// astTransformer is a goldmark AST transformer that rewrites links and
// images in a parsed markdown document.
type astTransformer struct {
	imgType string // output image type, passed to fetchImage and used in URLs
	maxSize int    // maximum image dimension, passed to fetchImage and used in URLs
	imgOpt  int    // encoder option, passed to fetchImage
	totSize int    // running total of bytes returned by fetchImage
}

// Transform walks the document and:
//   - prefixes every link destination with "/?m=html&t=<imgType>&s=<maxSize>&url="
//   - fetches every image with fetchImage under a fresh ID, points the image
//     at imgZpfx+ID and adds the returned size to t.totSize
//
// Images that fail to fetch are logged and removed from the document.
func (t *astTransformer) Transform(node *ast.Document, reader text.Reader, pc parser.Context) {
	// Images that could not be fetched. They are detached only after the
	// walk, because unlinking a node mid-walk clears its sibling pointers
	// and would cut the traversal of the parent short.
	var failed []ast.Node

	// The walker below never returns an error, so Walk's result carries none.
	_ = ast.Walk(node, func(n ast.Node, entering bool) (ast.WalkStatus, error) {
		if link, ok := n.(*ast.Link); ok && entering {
			// NOTE(review): the destination is appended without query
			// escaping, so a target containing '&' or '#' may be truncated
			// when parsed as the "url" parameter - confirm against the
			// request parser.
			link.Destination = append([]byte("/?m=html&t="+t.imgType+"&s="+strconv.Itoa(t.maxSize)+"&url="), link.Destination...)
		}
		if img, ok := n.(*ast.Image); ok && entering {
			seq := shortuuid.New() + "." + t.imgType
			size, err := fetchImage(seq, string(img.Destination), t.imgType, t.maxSize, t.imgOpt) // TODO: use goroutines with waitgroup
			if err != nil {
				log.Print(err)
				failed = append(failed, n)
				return ast.WalkContinue, nil
			}
			img.Destination = []byte(imgZpfx + seq)
			t.totSize += size
		}
		return ast.WalkContinue, nil
	})

	// Remove only the failed image itself. RemoveChildren(self) would drop
	// every child of the parent, i.e. all of the image's siblings as well.
	for _, n := range failed {
		if p := n.Parent(); p != nil {
			p.RemoveChild(p, n)
		}
	}
}
// captureMarkdown serves rq.url in simple HTML mode. The page is fetched and
// converted to markdown by html-to-markdown, then rendered back to HTML by
// goldmark with an astTransformer that rewrites links and images. The result
// is passed through asciify and written with rq.printUI. Conversion or
// rendering failures are reported to the client as HTTP 500.
func (rq *wrpReq) captureMarkdown() {
	log.Printf("Processing Markdown conversion request for %v", rq.url)
	// TODO: bug - DomainFromURL always prefixes with http:// instead of https
	// this causes issues on some websites, fix or write a smarter DomainFromURL
	conv := h2m.NewConverter(h2m.DomainFromURL(rq.url), true, nil)
	conv.Use(plugin.GitHubFlavored())
	md, err := conv.ConvertURL(rq.url) // We could also get inner html from chromedp
	if err != nil {
		http.Error(rq.w, err.Error(), http.StatusInternalServerError)
		return
	}
	log.Printf("Got %v bytes md from %v", len(md), rq.url)

	// The encoder option depends on the output type: JPEG quality for jpg,
	// number of colors for gif, and zero for everything else.
	tr := &astTransformer{imgType: rq.imgType, maxSize: int(rq.maxSize)}
	switch rq.imgType {
	case "jpg":
		tr.imgOpt = int(rq.jQual)
	case "gif":
		tr.imgOpt = int(rq.nColors)
	}

	renderer := goldmark.New(
		goldmark.WithExtensions(extension.GFM),
		goldmark.WithParserOptions(parser.WithASTTransformers(util.Prioritized(tr, 100))),
	)
	var page bytes.Buffer
	if err := renderer.Convert([]byte(md), &page); err != nil {
		http.Error(rq.w, err.Error(), http.StatusInternalServerError)
		return
	}
	log.Printf("Rendered %v bytes html for %v", page.Len(), rq.url)

	rq.printUI(uiParams{
		text:    string(asciify(page.Bytes())),
		bgColor: "#FFFFFF",
		imgSize: fmt.Sprintf("%.0f KB", float32(tr.totSize)/1024.0),
	})
}
// imgServerTxt is the HTTP handler for images held in imgStor. The image ID
// is the request path with the imgZpfx prefix removed. The image is deleted
// from the store once it has been looked up, so each stored image can be
// served only once. An unknown ID is answered with 404 Not Found.
func imgServerTxt(w http.ResponseWriter, r *http.Request) {
	log.Printf("%s IMGZ Request for %s", r.RemoteAddr, r.URL.Path)
	// Strip the prefix only at the start of the path, never in the middle.
	id := strings.TrimPrefix(r.URL.Path, imgZpfx)
	img, err := imgStor.get(id)
	if err != nil {
		// The only failure get reports is a missing entry.
		http.Error(w, err.Error(), http.StatusNotFound)
		log.Printf("%s IMGZ error for %s: %v", r.RemoteAddr, r.URL.Path, err)
		return
	}
	imgStor.del(id)
	w.Header().Set("Content-Type", http.DetectContentType(img))
	w.Header().Set("Content-Length", strconv.Itoa(len(img)))
	if _, err := w.Write(img); err != nil {
		log.Printf("%s IMGZ write error for %s: %v", r.RemoteAddr, r.URL.Path, err)
		return
	}
	// Not every ResponseWriter implements http.Flusher; an unchecked type
	// assertion would panic for those that do not.
	if f, ok := w.(http.Flusher); ok {
		f.Flush()
	}
}