#!/usr/bin/env ruby
# frozen_string_literal: true

# Archives one or more web pages as plain text.
#
# For every URL given on the command line the script loads the page in a
# headless browser (Ferrum), rewrites link-like attributes to absolute URLs,
# renders the HTML to text with `w3m`, stores the text under
# `static/archive/`, and prints a YAML reference entry. All entries are also
# piped to `pbcopy` at the end.
#
# Usage: <script> URL [URL ...]

require "uri"
require "digest"
require "time"
require "nokogiri"
require "ferrum"

# Resolves +url+ against +base_url+.
#
# @param url [String, nil] the possibly-relative URL
# @param base_url [String] the URL to resolve against
# @return [String, nil] the absolute URL, or +url+ unchanged when it is
#   nil/empty, uses a scheme that must not be resolved (data:, javascript:,
#   mailto:, tel:, about:), or cannot be joined with +base_url+
def absolutize_url(url, base_url)
  return url if url.nil? || url.empty?
  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)

  URI.join(base_url, url).to_s
rescue URI::Error
  # URI::Error covers both InvalidURIError (unparsable input) and
  # BadURIError (e.g. a non-absolute base), so either failure falls back to
  # the original value instead of aborting the whole archive run.
  url
end

# Resolves every candidate URL inside a `srcset` attribute value.
#
# NOTE: candidates are split on commas, so a candidate URL that itself
# contains a comma will be split apart.
#
# @param srcset [String] raw `srcset` attribute value
# @param base_url [String] the URL to resolve against
# @return [String] the rewritten value, candidates joined by ", "
def absolutize_srcset(srcset, base_url)
  candidates = srcset.split(",").map do |entry|
    src, descriptor = entry.strip.split(/\s+/, 2)
    next if src.nil? # blank candidate, e.g. from a trailing comma

    [absolutize_url(src, base_url), descriptor].compact.join(" ")
  end

  candidates.compact.join(", ")
end

# Rewrites `href`, `src`, `poster` and `srcset` attributes of +doc+ in place
# so that they hold absolute URLs.
#
# @param doc [Nokogiri::HTML::Document] parsed document (mutated)
# @param base_url [String] the URL to resolve against
def absolutize_links!(doc, base_url)
  %w[href src poster].each do |attr|
    doc.css("[#{attr}]").each do |node|
      node[attr] = absolutize_url(node[attr].to_s.strip, base_url)
    end
  end

  doc.css("[srcset]").each do |node|
    node["srcset"] = absolutize_srcset(node["srcset"].to_s.strip, base_url)
  end
end

# Renders +html+ to plain text by piping it through `w3m -dump`.
#
# @param html [String] HTML source
# @return [String] w3m's standard output
def text_from_html(html)
  command = ["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"]

  IO.popen(command, "r+") do |io|
    io.write(html)
    io.close_write # signal EOF so w3m starts rendering
    io.read
  end
end

# Formats +text+ as a single-line YAML double-quoted scalar.
#
# Whitespace runs (including newlines) are collapsed to one space, and
# backslashes and double quotes are escaped so the entry stays valid YAML.
#
# @param text [String]
# @return [String] the quoted scalar, including the surrounding quotes
def yaml_double_quoted(text)
  escaped = text.gsub(/\s+/, " ").gsub(/["\\]/) { |char| "\\#{char}" }
  %("#{escaped}")
end

# Loads +url+ in +page+ and extracts its title and text rendering.
#
# @param page [Ferrum::Page] browser page used for navigation
# @param url [String] page to load; also used as the base for relative links
# @return [Array(String, String)] the stripped title and the text content
# @raise [RuntimeError] when the document has no non-empty <title>
def fetch_page(page, url)
  page.goto(url)
  page.network.wait_for_idle(timeout: 10)

  html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
  doc = Nokogiri::HTML(html)
  absolutize_links!(doc, url)

  title = doc.at("title")&.text&.strip
  raise "No title found" if title.to_s.empty?

  [title, text_from_html(doc.to_html)]
end

# Builds the archive file name: the URL's host with dots replaced by dashes,
# followed by a short lowercase digest of the URL and the page text.
#
# @param url [String]
# @param text_content [String]
# @return [String] e.g. "example-com-a1b2c3.txt"
# @raise [URI::InvalidURIError] when +url+ cannot be parsed
# @raise [RuntimeError] when +url+ has no host component
def archive_filename(url, text_content)
  host = URI.parse(url).host
  raise "No host in URL" if host.nil?

  # Keep only the first six alphanumeric characters of the Base64 digest so
  # the suffix is safe to use in a file name.
  digest = Digest::MD5.base64digest(url + text_content)
  suffix = digest.scan(/[a-z0-9]/i).first(6).join.downcase

  "#{host.tr(".", "-")}-#{suffix}.txt"
end

# Builds the YAML list entry describing one archived page. The date is the
# current UTC time in ISO 8601 format.
#
# @param title [String]
# @param url [String]
# @param filename [String]
# @return [String] a four-line YAML fragment ending in a newline
def reference_entry(title, url, filename)
  <<~STR
    - title: #{yaml_double_quoted(title)}
      url: #{url}
      date: #{Time.now.utc.iso8601}
      file: #{filename}
  STR
end

urls = ARGV

if urls.empty?
  warn "Please supply one or more URLs"
  exit 1
end

puts "references:"

browser = Ferrum::Browser.new(
  timeout: 30,
  process_timeout: 30,
  browser_options: { "no-sandbox": nil }
)
clipboard = +""

begin
  # Created inside the guarded region so the browser is shut down even when
  # opening the page fails.
  page = browser.create_page

  urls.each do |url|
    begin
      title, text_content = fetch_page(page, url)
      filename = archive_filename(url, text_content)
    rescue => ex
      warn "Archive error (#{ex}; #{url})"
      exit 1 # raises SystemExit; the outer `ensure` still quits the browser
    end

    File.write("static/archive/#{filename}", text_content)

    entry = reference_entry(title, url, filename)
    puts entry
    clipboard << entry
  end
ensure
  browser.quit
end

IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }