From 7d96559965a0c3c92ffa7bfde46751f86ba21f5c Mon Sep 17 00:00:00 2001 From: Morten Piibeleht Date: Wed, 4 Sep 2024 11:50:51 +1200 Subject: [PATCH] Fix linkcheck_useragent implementation and tests (#2571) * Changes the `linkcheck_useragent` value that unsets the customization completely to `nothing`. * Fixes an error where linkcheck would throw when `linkcheck_useragent = ""` because the `if` block returns a `nothing` then. * Apparently, Intel already blocked the new user agent or something. So reorganizing the tests around the fact that they want the `curl` user agent. --- CHANGELOG.md | 4 ++- src/docchecks.jl | 54 +++++++++++++++++++++------------------- src/documents.jl | 4 +-- src/makedocs.jl | 8 ++++-- test/online_linkcheck.jl | 22 +++++++++++++++- 5 files changed, 60 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae31e39e12..cd6cfeb2f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -* The `User-Agent` header set in the linkcheck HTTP(S) requests can now be customized with the `linkcheck_useragent` option to `makedocs`. ([#2557], [#2562]) +* The `User-Agent` header set in the linkcheck HTTP(S) requests can now be customized with the `linkcheck_useragent` option to `makedocs`. ([#2557], [#2562], [#2571]) * Admonitions with category `todo` are now colored purple. Previously they were default-colored like all other unknown admonitions categories. ([#2526]) * A `checkdocs_ignored_modules` keyword argument to `makedocs(...)`, which prevents `checkdocs` from warning about missing documentation in certain modules. 
([#2233]) @@ -1896,6 +1896,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 [#2560]: https://github.com/JuliaDocs/Documenter.jl/issues/2560 [#2561]: https://github.com/JuliaDocs/Documenter.jl/issues/2561 [#2562]: https://github.com/JuliaDocs/Documenter.jl/issues/2562 +[#2569]: https://github.com/JuliaDocs/Documenter.jl/issues/2569 +[#2571]: https://github.com/JuliaDocs/Documenter.jl/issues/2571 [JuliaLang/julia#36953]: https://github.com/JuliaLang/julia/issues/36953 [JuliaLang/julia#38054]: https://github.com/JuliaLang/julia/issues/38054 [JuliaLang/julia#39841]: https://github.com/JuliaLang/julia/issues/39841 diff --git a/src/docchecks.jl b/src/docchecks.jl index af8bed60b4..492ce20da4 100644 --- a/src/docchecks.jl +++ b/src/docchecks.jl @@ -191,8 +191,6 @@ function linkcheck(node::MarkdownAST.Node, element::MarkdownAST.AbstractElement, return nothing end -const _LINKCHECK_DEFAULT_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" - function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document; method::Symbol=:HEAD) # first, make sure we're not supposed to ignore this link @@ -204,29 +202,7 @@ function linkcheck(node::MarkdownAST.Node, link::MarkdownAST.Link, doc::Document end if !haskey(doc.internal.locallinks, link) - timeout = doc.user.linkcheck_timeout - useragent = doc.user.linkcheck_useragent - null_file = @static Sys.iswindows() ? "nul" : "/dev/null" - # In some cases, web servers (e.g. docs.github.com as of 2022) will reject requests - # that declare a non-browser user agent (curl specifically passes 'curl/X.Y'). In - # case of docs.github.com, the server returns a 403 with a page saying "The request - # is blocked". However, spoofing a realistic browser User-Agent string is enough to - # get around this, and so here we simply pass the example Chrome UA string from the - # Mozilla developer docs, but only is it's a HTTP(S) request. 
- # - # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent#chrome_ua_string - fakebrowser = if startswith(uppercase(link.destination), "HTTP") - headers = [ - "-H", - "accept-encoding: gzip, deflate, br", - ] - if !isempty(useragent) - push!(headers, "--user-agent", useragent) - end - else - "" - end - cmd = `curl $(method === :HEAD ? "-sI" : "-s") --proto =http,https,ftp,ftps $(fakebrowser) $(link.destination) --max-time $timeout -o $null_file --write-out "%{http_code} %{url_effective} %{redirect_url}"` + cmd = _linkcheck_curl(method, link.destination; timeout=doc.user.linkcheck_timeout, useragent=doc.user.linkcheck_useragent) local result try @@ -279,10 +255,36 @@ function linkcheck(node::MarkdownAST.Node, docs_node::Documenter.DocsNode, doc:: end end - linkcheck_ismatch(r::String, url) = (url == r) linkcheck_ismatch(r::Regex, url) = occursin(r, url) +const _LINKCHECK_DEFAULT_USERAGENT = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36" + +function _linkcheck_curl(method::Symbol, url::AbstractString; timeout::Real, useragent::Union{AbstractString, Nothing}) + null_file = @static Sys.iswindows() ? "nul" : "/dev/null" + # In some cases, web servers (e.g. docs.github.com as of 2022) will reject requests + # that declare a non-browser user agent (curl specifically passes 'curl/X.Y'). In + # case of docs.github.com, the server returns a 403 with a page saying "The request + # is blocked". However, spoofing a realistic browser User-Agent string is enough to + # get around this, and so here we simply pass the example Chrome UA string from the + # Mozilla developer docs, but only if it's an HTTP(S) request. 
+ # + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent#chrome_ua_string + fakebrowser = if startswith(uppercase(url), "HTTP") + headers = [ + "-H", + "accept-encoding: gzip, deflate, br", + ] + if !isnothing(useragent) + push!(headers, "--user-agent", useragent) + end + headers + else + String[] + end + return `curl $(method === :HEAD ? "-sI" : "-s") --proto =http,https,ftp,ftps $(fakebrowser) $(url) --max-time $timeout -o $null_file --write-out "%{http_code} %{url_effective} %{redirect_url}"` +end + # Automatic Pkg.add() GitHub remote check # --------------------------------------- diff --git a/src/documents.jl b/src/documents.jl index 05e9cf6b89..7a8a26c88f 100644 --- a/src/documents.jl +++ b/src/documents.jl @@ -312,7 +312,7 @@ struct User linkcheck::Bool # Check external links.. linkcheck_ignore::Vector{Union{String,Regex}} # ..and then ignore (some of) them. linkcheck_timeout::Real # ..but only wait this many seconds for each one. - linkcheck_useragent::String # User agent to use for linkchecks. + linkcheck_useragent::Union{String, Nothing} # User agent to use for linkchecks. checkdocs::Symbol # Check objects missing from `@docs` blocks. `:none`, `:exports`, or `:all`. checkdocs_ignored_modules::Vector{Module} # ..and then ignore (some of) them. doctestfilters::Vector{Regex} # Filtering for doctests @@ -387,7 +387,7 @@ function Document(; linkcheck:: Bool = false, linkcheck_ignore :: Vector = [], linkcheck_timeout :: Real = 10, - linkcheck_useragent :: String= _LINKCHECK_DEFAULT_USERAGENT, + linkcheck_useragent :: Union{AbstractString, Nothing} = _LINKCHECK_DEFAULT_USERAGENT, checkdocs::Symbol = :all, checkdocs_ignored_modules::Vector{Module} = Module[], doctestfilters::Vector{Regex}= Regex[], diff --git a/src/makedocs.jl b/src/makedocs.jl index dc3d44e737..f2ade22f58 100644 --- a/src/makedocs.jl +++ b/src/makedocs.jl @@ -201,13 +201,17 @@ ignored. return a response before giving up. The default is 10 seconds. 
**`linkcheck_useragent`** can be used to override the user agent string used by the HTTP and -HTTPS requests made when checking for broken links. Currently, the default user agent is +HTTPS requests made when checking for broken links. If set to `nothing`, it uses the default +user agent string of the library/tool used to actually perform the requests (currently, the +system's `curl` binary). + +If unset, Documenter uses the following user agent string: ``` $(_LINKCHECK_DEFAULT_USERAGENT) ``` -which is set to mimic a realistic web browser. However, the exact user agent string is subject +This is set to mimic a realistic web browser. However, the exact user agent string is subject to change. As such, it is possible that breakages can occur when Documenter's version changes, but the goal is to set the user agent such that it would be accepted by as many web servers as possible. diff --git a/test/online_linkcheck.jl b/test/online_linkcheck.jl index 7166a449cb..139ae6ed4d 100644 --- a/test/online_linkcheck.jl +++ b/test/online_linkcheck.jl @@ -16,7 +16,6 @@ using Test [FTP (no proto) success](ftp.iana.org/tz/data/etcetera) [Redirect success](google.com) [HEAD fail GET success](https://codecov.io/gh/invenia/LibPQ.jl) - [Linkcheck old Chrome UA fail](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) """ ) doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20) @@ -25,6 +24,27 @@ using Test @test doc.internal.errors == Set{Symbol}() end + @testset "Empty User-Agent" begin + src = convert( + MarkdownAST.Node, + md""" + [Linkcheck Empty UA](https://www.intel.com/content/www/us/en/developer/tools/oneapi/mpi-library.html) + """ + ) + + # The default user-agent fails (intel servers block it) + doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20) + doc.blueprint.pages["testpage"] = Documenter.Page("", "", "", [], Documenter.Globals(), src) + @test_logs (:error,) @test linkcheck(doc) === nothing + @test doc.internal.errors == 
Set{Symbol}([:linkcheck]) + + # You can work around this by setting linkcheck_useragent=nothing, which falls back to curl's own user agent + doc = Documenter.Document(; linkcheck=true, linkcheck_timeout=20, linkcheck_useragent=nothing) + doc.blueprint.pages["testpage"] = Documenter.Page("", "", "", [], Documenter.Globals(), src) + @test linkcheck(doc) === nothing + @test doc.internal.errors == Set{Symbol}() + end + @testset "Failures" begin src = convert(MarkdownAST.Node, Markdown.parse("[FILE failure](file://$(@__FILE__))")) doc = Documenter.Document(; linkcheck=true)