Update archive script to run headless browser

This commit is contained in:
David Eisinger
2026-04-02 00:07:25 -04:00
parent 9368e1ec38
commit da1673d49d

View File

@@ -1,9 +1,51 @@
#!/usr/bin/env ruby #!/usr/bin/env ruby
require "cgi"
require "uri" require "uri"
require "digest" require "digest"
require "time" require "time"
require "nokogiri"
require "ferrum"
# Resolves +url+ against +base_url+ and returns the absolute URL as a String.
#
# The value is returned untouched when it is nil or empty, or when it uses a
# scheme that is not resolved against a base (data:, javascript:, mailto:,
# tel:, about:). If the URL cannot be parsed or resolved, the original value
# is returned instead of raising.
#
# @param url [String, nil] the possibly-relative URL to resolve
# @param base_url [String] the URL that relative references are resolved against
# @return [String, nil] the absolute URL, or +url+ unchanged when it cannot be resolved
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error is the parent of both InvalidURIError (unparseable input) and
  # BadURIError (e.g. a relative base), so every resolution failure falls
  # back to the original value.
  url
end
# Rewrites every image candidate URL in a +srcset+ attribute value so that it
# is absolute, keeping each candidate's descriptor (e.g. "2x", "480w").
#
# Candidates are tokenized the way the HTML srcset grammar describes: the URL
# is a run of non-whitespace characters, and a comma only separates candidates
# when it ends a URL or follows the descriptor. This keeps URLs that contain
# commas (data: URIs, CDN transform paths) in one piece.
#
# @param srcset [String, nil] raw srcset attribute value
# @param base_url [String] the URL that relative references are resolved against
# @return [String] the candidates joined with ", "; "" when there are none
def absolutize_srcset(srcset, base_url)
  remaining = srcset.to_s
  candidates = []

  loop do
    # Skip separators (whitespace and stray commas) before the next candidate.
    remaining = remaining.sub(/\A[\s,]+/, "")
    break if remaining.empty?

    raw_src = remaining[/\A\S+/]
    remaining = remaining[raw_src.length..-1]

    if raw_src.end_with?(",")
      # Trailing commas terminate a candidate that has no descriptor.
      src = raw_src.sub(/,+\z/, "")
      descriptor = nil
    else
      # Everything up to the next comma is this candidate's descriptor.
      raw_descriptor = remaining[/\A[^,]*/]
      remaining = remaining[raw_descriptor.length..-1]
      src = raw_src
      descriptor = raw_descriptor.strip
      descriptor = nil if descriptor.empty?
    end

    candidates << [absolutize_url(src, base_url), descriptor].compact.join(" ")
  end

  candidates.join(", ")
end
# Mutates +doc+ in place so that URL-bearing attributes hold absolute URLs.
#
# Every element carrying an href, src or poster attribute has that attribute's
# stripped value passed through absolutize_url; every element carrying a
# srcset attribute has its stripped value passed through absolutize_srcset.
#
# @param doc [#css] parsed document; elements must support [] and []=
# @param base_url [String] the URL that relative references are resolved against
# @return [Object] whatever the final +each+ over the srcset elements returns
def absolutize_links!(doc, base_url)
  %w[href src poster].each do |name|
    doc.css("[#{name}]").each { |element| element[name] = absolutize_url(element[name].to_s.strip, base_url) }
  end

  doc.css("[srcset]").each do |element|
    candidates = element["srcset"].to_s.strip
    element["srcset"] = absolutize_srcset(candidates, base_url)
  end
end
# Renders an HTML string to plain text by piping it through the external
# `w3m` program in dump mode with link numbering enabled.
#
# @param html [String] HTML markup to render
# @return [String] everything w3m wrote to its standard output
def text_from_html(html)
  command = ["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"]

  IO.popen(command, "r+") do |pipe|
    pipe.write(html)
    # Close our write end so w3m sees EOF on its stdin before we read.
    pipe.close_write
    pipe.read
  end
end
*urls = ARGV *urls = ARGV
clipboard = "" clipboard = ""
@@ -15,20 +57,29 @@ end
puts "references:" puts "references:"
urls.each do |url| browser = Ferrum::Browser.new(
page_content = `curl -s #{url}` timeout: 30,
text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}` process_timeout: 30,
browser_options: { "no-sandbox": nil }
)
page = browser.create_page
begin
urls.each do |url|
begin begin
title = CGI.unescapeHTML( page.goto(url)
page_content page.network.wait_for_idle(timeout: 10)
.scan(/<title[^>]*>(.*?)<\/title>/mi)
.first html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
.first doc = Nokogiri::HTML(html)
.strip absolutize_links!(doc, url)
)
title = doc.at("title")&.text&.strip
raise "No title found" if title.to_s.empty?
text_content = text_from_html(doc.to_html)
rescue => ex rescue => ex
warn "Title error (#{ex}; #{url})" warn "Archive error (#{ex}; #{url})"
exit 1 exit 1
end end
@@ -52,6 +103,9 @@ urls.each do |url|
puts yaml puts yaml
clipboard += yaml clipboard += yaml
end
ensure
browser.quit
end end
IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) } IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }