Patch summary for bin/archive:
  * Drops the "cgi" require; adds "nokogiri" and "ferrum".
  * Adds helpers: absolutize_url (URI.join against a base URL; nil/empty and
    data/javascript/mailto/tel/about URLs are returned unchanged, as is any URL
    that raises URI::InvalidURIError), absolutize_srcset, absolutize_links!
    (rewrites href/src/poster/srcset attributes) and text_from_html (pipes HTML
    to `w3m -dump` through IO.popen).
  * Replaces the per-URL `curl` / `w3m <url>` backtick calls and the regex title
    scrape with a Ferrum page load, a Nokogiri parse and the helpers above.
  * Wraps the URL loop in begin/ensure so browser.quit runs on exit.
NOTE(review): line breaks and indentation were reconstructed from a collapsed
copy of this patch, and the removed title regex was restored to
/<title[^>]*>(.*?)<\/title>/mi; confirm both against the repository.

diff --git a/bin/archive b/bin/archive
index 057c311..47c23f6 100755
--- a/bin/archive
+++ b/bin/archive
@@ -1,9 +1,51 @@
 #!/usr/bin/env ruby
 
-require "cgi"
 require "uri"
 require "digest"
 require "time"
+require "nokogiri"
+require "ferrum"
+
+def absolutize_url(url, base_url)
+  return url if url.nil? || url.empty?
+  return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i)
+
+  URI.join(base_url, url).to_s
+rescue URI::InvalidURIError
+  url
+end
+
+def absolutize_srcset(srcset, base_url)
+  srcset.split(",").map do |entry|
+    parts = entry.strip.split(/\s+/, 2)
+    next if parts.empty?
+
+    src = absolutize_url(parts[0], base_url)
+    descriptor = parts[1]
+
+    [src, descriptor].compact.join(" ")
+  end.compact.join(", ")
+end
+
+def absolutize_links!(doc, base_url)
+  %w[href src poster].each do |attr|
+    doc.css("[#{attr}]").each do |node|
+      node[attr] = absolutize_url(node[attr].to_s.strip, base_url)
+    end
+  end
+
+  doc.css("[srcset]").each do |node|
+    node["srcset"] = absolutize_srcset(node["srcset"].to_s.strip, base_url)
+  end
+end
+
+def text_from_html(html)
+  IO.popen(["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"], "r+") do |io|
+    io.write(html)
+    io.close_write
+    io.read
+  end
+end
 
 *urls = ARGV
 clipboard = ""
@@ -15,43 +57,55 @@ end
 
 puts "references:"
 
-urls.each do |url|
-  page_content = `curl -s #{url}`
-  text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}`
+browser = Ferrum::Browser.new(
+  timeout: 30,
+  process_timeout: 30,
+  browser_options: { "no-sandbox": nil }
+)
+page = browser.create_page
 
-  begin
-    title = CGI.unescapeHTML(
-      page_content
-        .scan(/<title[^>]*>(.*?)<\/title>/mi)
-        .first
-        .first
-        .strip
-    )
-  rescue => ex
-    warn "Title error (#{ex}; #{url})"
-    exit 1
+begin
+  urls.each do |url|
+    begin
+      page.goto(url)
+      page.network.wait_for_idle(timeout: 10)
+
+      html = page.body.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
+      doc = Nokogiri::HTML(html)
+      absolutize_links!(doc, url)
+
+      title = doc.at("title")&.text&.strip
+      raise "No title found" if title.to_s.empty?
+
+      text_content = text_from_html(doc.to_html)
+    rescue => ex
+      warn "Archive error (#{ex}; #{url})"
+      exit 1
+    end
+
+    hash = Digest::MD5.base64digest(url + text_content)
+      .scan(/[a-z0-9]/i)
+      .first(6)
+      .join
+      .downcase
+
+    filename = "#{URI.parse(url).host.gsub(".", "-")}-#{hash}.txt"
+
+    File.write("static/archive/#{filename}", text_content)
+
+    yaml = <<~STR
+      - title: "#{title}"
+        url: #{url}
+        date: #{Time.now.utc.iso8601}
+        file: #{filename}
+    STR
+
+    puts yaml
+
+    clipboard += yaml
   end
-
-  hash = Digest::MD5.base64digest(url + text_content)
-    .scan(/[a-z0-9]/i)
-    .first(6)
-    .join
-    .downcase
-
-  filename = "#{URI.parse(url).host.gsub(".", "-")}-#{hash}.txt"
-
-  File.write("static/archive/#{filename}", text_content)
-
-  yaml = <<~STR
-    - title: "#{title}"
-      url: #{url}
-      date: #{Time.now.utc.iso8601}
-      file: #{filename}
-  STR
-
-  puts yaml
-
-  clipboard += yaml
+ensure
+  browser.quit
 end
 
 IO.popen("pbcopy", "w") { |pb| pb.write(clipboard) }