Update archive script to run headless browser
This commit is contained in:
78
bin/archive
78
bin/archive
@@ -1,9 +1,51 @@
|
|||||||
#!/usr/bin/env ruby
|
#!/usr/bin/env ruby
|
||||||
|
|
||||||
require "cgi"
|
|
||||||
require "uri"
|
require "uri"
|
||||||
require "digest"
|
require "digest"
|
||||||
require "time"
|
require "time"
|
||||||
|
require "nokogiri"
|
||||||
|
require "ferrum"
|
||||||
|
|
||||||
|
# Resolves +url+ against +base_url+ and returns the absolute form.
#
# Blank values and URLs with a data, javascript, mailto, tel or about
# scheme are returned untouched. A value that the URI library cannot parse
# or join is also returned unchanged instead of raising.
#
# @param url [String, nil] attribute value to resolve
# @param base_url [String] URL to resolve relative references against
# @return [String, nil] the absolute URL, or +url+ itself when it is blank,
#   uses a skipped scheme, or cannot be resolved
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error is the parent of InvalidURIError (unparseable input) and of
  # BadURIError, which URI.join raises when neither argument is absolute.
  url
end
|
||||||
|
|
||||||
|
# Rewrites every image candidate in a +srcset+ attribute value so that its
# URL is absolute, keeping each candidate's descriptor (e.g. "2x", "480w").
#
# Candidates are scanned in the manner of the HTML srcset parsing rules
# instead of being split on every comma: a URL is a run of non-whitespace
# characters, so commas inside a URL (data: URIs, CDN transform paths) stay
# part of it. A comma only separates candidates when it trails a URL or
# follows a descriptor. Parenthesised descriptors are not handled.
#
# @param srcset [String, nil] raw srcset attribute value
# @param base_url [String] URL to resolve relative references against
# @return [String] candidates joined by ", "; empty when there are none
def absolutize_srcset(srcset, base_url)
  remaining = srcset.to_s.dup
  candidates = []

  loop do
    # Drop separators and whitespace left over from the previous candidate.
    remaining.sub!(/\A[\s,]+/, "")
    break if remaining.empty?

    src = remaining.slice!(/\A\S+/)
    descriptor = nil

    if src.end_with?(",")
      # A trailing comma ends the candidate right here: no descriptor.
      src = src.sub(/,+\z/, "")
    else
      descriptor = remaining.slice!(/\A[^,]*/).strip
      descriptor = nil if descriptor.empty?
    end

    candidates << [absolutize_url(src, base_url), descriptor].compact.join(" ")
  end

  candidates.join(", ")
end
|
||||||
|
|
||||||
|
# Rewrites URL-bearing attributes of +doc+ in place so they are absolute.
#
# Every element carrying an href, src or poster attribute has that value
# stripped of surrounding whitespace and passed through absolutize_url;
# every srcset attribute is passed through absolutize_srcset.
#
# @param doc [#css] parsed document; each node yielded by +css+ must
#   support +[]+ and +[]=+ for attribute access
# @param base_url [String] URL to resolve relative references against
def absolutize_links!(doc, base_url)
  %w[href src poster].each do |attr|
    doc.css("[#{attr}]").each do |node|
      node[attr] = absolutize_url(node[attr].to_s.strip, base_url)
    end
  end

  doc.css("[srcset]").each do |node|
    node["srcset"] = absolutize_srcset(node["srcset"].to_s.strip, base_url)
  end
end
|
||||||
|
|
||||||
|
# Default renderer: w3m reading HTML from stdin and dumping plain text,
# with the display_link_number option switched on.
W3M_DUMP_COMMAND = %w[w3m -dump -T text/html -o display_link_number=1].freeze

# Renders an HTML string to text by piping it through an external command.
#
# The HTML is written to the command's stdin, stdin is closed so the command
# sees end-of-input, and everything the command prints on stdout is returned.
#
# @param html [String] markup to render
# @param command [Array<String>] argv of the renderer; defaults to w3m
# @return [String] the command's standard output
# @raise [RuntimeError] when the command exits unsuccessfully, so a failed
#   render is not mistaken for an empty page
def text_from_html(html, command: W3M_DUMP_COMMAND)
  text = IO.popen(command, "r+") do |io|
    io.write(html)
    io.close_write
    io.read
  end

  # The block form of IO.popen waits for the child, so its status is
  # available once the block has returned.
  status = Process.last_status
  raise "#{command.first} failed (#{status})" unless status&.success?

  text
end
|
||||||
|
|
||||||
*urls = ARGV
|
*urls = ARGV
|
||||||
clipboard = ""
|
clipboard = ""
|
||||||
@@ -15,20 +57,29 @@ end
|
|||||||
|
|
||||||
puts "references:"
|
puts "references:"
|
||||||
|
|
||||||
urls.each do |url|
|
browser = Ferrum::Browser.new(
|
||||||
page_content = `curl -s #{url}`
|
timeout: 30,
|
||||||
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
|
process_timeout: 30,
|
||||||
|
browser_options: { "no-sandbox": nil }
|
||||||
|
)
|
||||||
|
page = browser.create_page
|
||||||
|
|
||||||
|
begin
|
||||||
|
urls.each do |url|
|
||||||
begin
|
begin
|
||||||
title = CGI.unescapeHTML(
|
page.goto(url)
|
||||||
page_content
|
page.network.wait_for_idle(timeout: 10)
|
||||||
.scan(/<title[^>]*>(.*?)<\/title>/mi)
|
|
||||||
.first
|
html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
|
||||||
.first
|
doc = Nokogiri::HTML(html)
|
||||||
.strip
|
absolutize_links!(doc, url)
|
||||||
)
|
|
||||||
|
title = doc.at("title")&.text&.strip
|
||||||
|
raise "No title found" if title.to_s.empty?
|
||||||
|
|
||||||
|
text_content = text_from_html(doc.to_html)
|
||||||
rescue => ex
|
rescue => ex
|
||||||
warn "Title error (#{ex}; #{url})"
|
warn "Archive error (#{ex}; #{url})"
|
||||||
exit 1
|
exit 1
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -52,6 +103,9 @@ urls.each do |url|
|
|||||||
puts yaml
|
puts yaml
|
||||||
|
|
||||||
clipboard += yaml
|
clipboard += yaml
|
||||||
|
end
|
||||||
|
ensure
|
||||||
|
browser.quit
|
||||||
end
|
end
|
||||||
|
|
||||||
IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }
|
IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }
|
||||||
|
|||||||
Reference in New Issue
Block a user