Update archive script to run headless browser
This commit is contained in:
78
bin/archive
78
bin/archive
@@ -1,9 +1,51 @@
|
||||
#!/usr/bin/env ruby
|
||||
|
||||
require "cgi"
|
||||
require "uri"
|
||||
require "digest"
|
||||
require "time"
|
||||
require "nokogiri"
|
||||
require "ferrum"
|
||||
|
||||
# Resolves `url` against `base_url` and returns the absolute URL as a String.
#
# The input is returned unchanged when it is nil or empty, when it uses a
# scheme that has nothing to resolve (data, javascript, mailto, tel, about),
# or when URI cannot parse or join it.
#
# @param url [String, nil] possibly-relative reference
# @param base_url [String] URL the reference is resolved against
# @return [String, nil] absolute URL, or the original value as described above
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error is the parent of both InvalidURIError (unparseable input) and
  # BadURIError (e.g. joining two relative references). In either case the
  # untouched value is the best answer available.
  url
end
|
||||
|
||||
# Absolutizes every image-candidate URL in a `srcset` attribute value.
#
# Candidates are tokenized the way browsers do it: a URL is a run of
# non-whitespace characters, so commas inside a URL (data: URIs, CDN paths
# such as "/w_400,c_fill/a.jpg") are preserved. A comma separates candidates
# only when it trails a URL or follows a descriptor. Parenthesized
# descriptors get no special treatment.
#
# @param srcset [String] raw attribute value
# @param base_url [String] URL each candidate is resolved against
# @return [String] candidates re-joined with ", " ("" when there are none)
def absolutize_srcset(srcset, base_url)
  # Leading separators are skipped atomically to keep matching linear.
  # Group 1: the URL minus any trailing commas.
  # Group 2: descriptor text up to the next comma, when one is present.
  candidate = /(?>[\s,]*)(\S*[^\s,])(?:,+|\s+([^,]*))?/

  srcset.scan(candidate).map do |raw_url, raw_descriptor|
    src = absolutize_url(raw_url, base_url)
    descriptor = raw_descriptor.to_s.strip
    descriptor.empty? ? src : "#{src} #{descriptor}"
  end.join(", ")
end
|
||||
|
||||
# Rewrites, in place, the URL-carrying attributes of every matching node in
# `doc` so they are absolute with respect to `base_url`.
#
# `href`, `src` and `poster` values go through `absolutize_url`; `srcset`
# values go through `absolutize_srcset`. Attribute values are stripped of
# surrounding whitespace before being resolved.
#
# @param doc [#css] parsed document whose nodes support `[]` / `[]=`
# @param base_url [String] URL that relative references are resolved against
def absolutize_links!(doc, base_url)
  ["href", "src", "poster"].each do |attribute|
    selector = "[#{attribute}]"
    doc.css(selector).each do |element|
      raw_value = element[attribute].to_s.strip
      element[attribute] = absolutize_url(raw_value, base_url)
    end
  end

  doc.css("[srcset]").each do |element|
    raw_candidates = element["srcset"].to_s.strip
    element["srcset"] = absolutize_srcset(raw_candidates, base_url)
  end
end
|
||||
|
||||
# Pipes `html` through `w3m -dump` and returns whatever w3m prints on stdout.
#
# w3m is told to treat its stdin as text/html and is run with the
# `display_link_number=1` option.
#
# @param html [String] HTML document to render
# @return [String] w3m's standard output
def text_from_html(html)
  command = ["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"]

  IO.popen(command, "r+") do |pipe|
    pipe.write(html)
    # Close our write end so w3m sees EOF on stdin before we read its output.
    pipe.close_write
    pipe.read
  end
end
|
||||
|
||||
*urls = ARGV
|
||||
clipboard = ""
|
||||
@@ -15,20 +57,29 @@ end
|
||||
|
||||
puts "references:"
|
||||
|
||||
urls.each do |url|
|
||||
page_content = `curl -s #{url}`
|
||||
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
|
||||
browser = Ferrum::Browser.new(
|
||||
timeout: 30,
|
||||
process_timeout: 30,
|
||||
browser_options: { "no-sandbox": nil }
|
||||
)
|
||||
page = browser.create_page
|
||||
|
||||
begin
|
||||
title = CGI.unescapeHTML(
|
||||
page_content
|
||||
.scan(/<title[^>]*>(.*?)<\/title>/mi)
|
||||
.first
|
||||
.first
|
||||
.strip
|
||||
)
|
||||
urls.each do |url|
|
||||
begin
|
||||
page.goto(url)
|
||||
page.network.wait_for_idle(timeout: 10)
|
||||
|
||||
html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
|
||||
doc = Nokogiri::HTML(html)
|
||||
absolutize_links!(doc, url)
|
||||
|
||||
title = doc.at("title")&.text&.strip
|
||||
raise "No title found" if title.to_s.empty?
|
||||
|
||||
text_content = text_from_html(doc.to_html)
|
||||
rescue => ex
|
||||
warn "Title error (#{ex}; #{url})"
|
||||
warn "Archive error (#{ex}; #{url})"
|
||||
exit 1
|
||||
end
|
||||
|
||||
@@ -53,5 +104,8 @@ urls.each do |url|
|
||||
|
||||
clipboard += yaml
|
||||
end
|
||||
ensure
|
||||
browser.quit
|
||||
end
|
||||
|
||||
IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }
|
||||
|
||||
Reference in New Issue
Block a user