# Review notes for the patch below (git unified diff against bin/archive).
# In this view the whole patch sits on one line and stops mid-hunk at `.scan(/`,
# so the tail of the second hunk is not visible here.
#
# Hunk 1 (@@ -1,9 +1,51 @@):
#   * Drops `require "cgi"`; adds `require "nokogiri"` and `require "ferrum"`.
#   * absolutize_url(url, base_url): returns `url` untouched when it is nil, empty,
#     or starts with a data:/javascript:/mailto:/tel:/about: scheme; otherwise
#     returns URI.join(base_url, url).to_s, falling back to `url` on
#     URI::InvalidURIError.
#     NOTE(review): only URI::InvalidURIError is rescued; URI.join may raise other
#     URI errors (e.g. for a relative base_url) -- confirm whether those can occur.
#   * absolutize_srcset(srcset, base_url): splits on ",", splits each entry into a
#     URL and an optional descriptor at the first whitespace run, absolutizes the
#     URL, and rejoins the entries with ", ". Empty entries are dropped.
#     NOTE(review): a candidate URL that itself contains a comma would be split
#     apart by the plain "," split -- verify against the pages being archived.
#   * absolutize_links!(doc, base_url): rewrites every href/src/poster attribute and
#     every srcset attribute found via doc.css, in place.
#     Presumably `doc` is a Nokogiri document -- confirm at the call site.
#   * text_from_html(html): writes `html` to the stdin of
#     `w3m -dump -T text/html -o display_link_number=1`, closes the write side and
#     returns what w3m printed. The command is passed to IO.popen as an argv array.
#     NOTE(review): the whole input is written before any output is read -- confirm
#     this cannot stall on very large pages.
#
# Hunk 2 (@@ -15,43 +57,55 @@):
#   * Removes the `urls.each` loop that interpolated each URL into backtick shell
#     commands (`curl -s #{url}` and `w3m ... #{url}`) and extracted the title with
#     CGI.unescapeHTML.
#   * Adds a Ferrum::Browser (timeout 30, process_timeout 30, "no-sandbox" browser
#     option) and creates a page from it.
#     NOTE(review): browser shutdown is not visible in this view -- check that the
#     rest of the hunk quits the browser.
diff --git a/bin/archive b/bin/archive index 057c311..47c23f6 100755 --- a/bin/archive +++ b/bin/archive @@ -1,9 +1,51 @@ #!/usr/bin/env ruby -require "cgi" require "uri" require "digest" require "time" +require "nokogiri" +require "ferrum" + +def absolutize_url(url, base_url) + return url if url.nil? || url.empty? + return url if url.match?(/\A(?:data|javascript|mailto|tel|about):/i) + + URI.join(base_url, url).to_s +rescue URI::InvalidURIError + url +end + +def absolutize_srcset(srcset, base_url) + srcset.split(",").map do |entry| + parts = entry.strip.split(/\s+/, 2) + next if parts.empty? + + src = absolutize_url(parts[0], base_url) + descriptor = parts[1] + + [src, descriptor].compact.join(" ") + end.compact.join(", ") +end + +def absolutize_links!(doc, base_url) + %w[href src poster].each do |attr| + doc.css("[#{attr}]").each do |node| + node[attr] = absolutize_url(node[attr].to_s.strip, base_url) + end + end + + doc.css("[srcset]").each do |node| + node["srcset"] = absolutize_srcset(node["srcset"].to_s.strip, base_url) + end +end + +def text_from_html(html) + IO.popen(["w3m", "-dump", "-T", "text/html", "-o", "display_link_number=1"], "r+") do |io| + io.write(html) + io.close_write + io.read + end +end *urls = ARGV clipboard = "" @@ -15,43 +57,55 @@ end puts "references:" -urls.each do |url| - page_content = `curl -s #{url}` - text_content = `w3m -dump -T text/html -o display_link_number=1 #{url}` +browser = Ferrum::Browser.new( + timeout: 30, + process_timeout: 30, + browser_options: { "no-sandbox": nil } +) +page = browser.create_page - begin - title = CGI.unescapeHTML( - page_content - .scan(/