官网:https://www.kanxue.com/

使用 HTTParty 发送 POST 请求

cat scripts/get_blogs_from_kanxue_use_httparty.rb
# Bootstrap for the kanxue crawler script.
# The Rails environment is taken from the first command-line argument, then
# from an already-set RAILS_ENV variable, and finally falls back to
# 'development'.
ENV['RAILS_ENV'] = ARGV.first || ENV['RAILS_ENV'] || 'development'
# Load config/environment, resolved relative to the directory above the one
# containing this script (presumably the Rails application root — the rest of
# the script relies on the Blog and Category constants being defined).
require File.expand_path(File.dirname(__FILE__) + "/../config/environment")
require 'rails'
require 'json'
require 'rubygems'
require 'httparty'
# NOTE(review): nokogiri is not referenced in the visible part of this script.
require 'nokogiri'
# Send all Rails.logger output of this run to a dedicated log file. The path
# is relative, so it is resolved against the current working directory.
Rails.logger = Logger.new("log/update_kanxue_blogs_using_httparty.log")
# Crawl the kanxue.com home-page article feed page by page and mirror every
# article into the local Blog table, plus one Category row per tag.
#
# For each page in first_page..last_page the script:
#   1. POSTs to the "more articles" endpoint with the page number,
#   2. parses the JSON reply and reads result['message']['list'],
#   3. creates a Blog row for every article not stored yet (matched on
#      blog_url + source_website), sleeping 10s after each insert,
#   4. creates the missing Category rows for the article's comma-separated tags,
#   5. sleeps 50s before requesting the next page.
#
# A page whose reply is not valid JSON, or does not have the expected shape,
# is logged and skipped instead of aborting the whole run.
first_page = 19
last_page = 1650
url = "https://www.kanxue.com/homepost-morearticle.htm"

# Request headers copied from a browser session. Two headers from the original
# capture are intentionally NOT sent:
#   * Accept-Encoding: setting it by hand (and advertising "br") can leave the
#     client with a compressed body it cannot decode; without it the HTTP
#     library negotiates and decompresses on its own.
#   * Content-Length: the parameters are sent through :query, so a hard-coded
#     "18" does not describe this request; the HTTP library computes the value.
#
# NOTE(review): the default cookie below embeds a session id in source control.
# Set KANXUE_COOKIE in the environment to override it without editing the file.
headers = {
  "Host" => "www.kanxue.com",
  "User-Agent" => "Mozilla/5.0 (X11; Linux x86_64; rb:108.0) Gecko/20100101 Firefox/108.0",
  "Accept" => "text/plain, */*; q=0.01",
  "Accept-Language" => "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
  "Referer" => "https://www.kanxue.com/",
  "Content-Type" => "application/x-www-form-urlencoded; charset=UTF-8",
  "X-Requested-With" => "XMLHttpRequest",
  "Origin" => "https://www.kanxue.com",
  "Connection" => "keep-alive",
  "Cookie" => ENV.fetch(
    "KANXUE_COOKIE",
    "__jsluid_s=61f26b9d4fbbe7b03612225e7a61b899; Hm_lvt_820e73ad7ccba42be0e5b528c537e327=1666926727,1667283362; __jsluid_h=dfdce607d78e6fd261a8a092f8d3eb39; tree_expandedNodes=%2C1001600%2C1001671%2C1000000%2C1000001%2C1000002%2C1001202%2C1000713%2C1001372%2C1001373%2C1000003%2C1000965%2C1001189%2C1000966%2C1000972%2C1001522%2C; PHPSESSID=278ff8bf0af353e637c64377aad1716c"
  ),
  "Sec-Fetch-Dest" => "empty",
  "Sec-Fetch-Mode" => "cors",
  "Sec-Fetch-Site" => "same-origin",
  "DNT" => "1"
}

(first_page..last_page).each do |i|
  query = {
    "page" => i.to_s,
    "pagesize" => "10"
  }
  Rails.logger.info "==== step 1 httparty post, query: #{query}, url: #{url}"
  response = HTTParty.post(
    url,
    :query => query,
    :headers => headers
  )

  # Parse the raw body explicitly: the response object itself is not a String.
  result =
    begin
      JSON.parse(response.body.to_s)
    rescue JSON::ParserError => e
      Rails.logger.error "=== step 2 page #{i}: response is not valid JSON (#{e.message}), response.code: #{response.code}"
      nil
    end
  Rails.logger.info "=== step 2 after post, result: #{result} response.code, #{response.code} ===response.headers is #{response.headers} query: #{query}"

  # Only descend into the payload when it has the expected Hash/Array shape.
  message = result.is_a?(Hash) ? result['message'] : nil
  list = message.is_a?(Hash) ? message['list'] : nil
  blogs = list.is_a?(Array) ? list : []

  blogs.each do |blog|
    source_url = blog['source_url']
    Rails.logger.info "=== step 3 before if blog_url #{source_url.inspect}"
    if source_url.present?
      blog_url = source_url
      Rails.logger.info "==== step 3.1 in present blog_url: #{blog_url} title: #{blog['subject']} created_at: #{blog['create_date_fmt']}"
    else
      # Articles without an external source live on the zhuanlan sub-site.
      blog_url = "https://zhuanlan.kanxue.com/article-#{blog['articleid']}.htm"
      Rails.logger.info "==== step 3.2 in else blog_url: #{blog_url} title: #{blog['subject']} created_at: #{blog['create_date_fmt']}"
    end

    blog_local = Blog.where(blog_url: blog_url, source_website: 'kanxue').first
    if blog_local.nil?
      blog_local = Blog.create(
        blog_url: blog_url,
        title: blog['subject'],
        created_at: blog['create_date_fmt'],
        author: (blog['userinfo'] || {})['username'],
        image_url: "https://www.kanxue.com/#{blog['pic_fmt']}",
        brief: blog['brief'],
        views: blog['views'].to_i,
        source_website: 'kanxue'
      )
      sleep 10
    end

    # A record that failed to save has no id; do not attach categories to it.
    unless blog_local.persisted?
      Rails.logger.error "==== step 3.3 blog not saved, skip categories, blog_url: #{blog_url} errors: #{blog_local.errors.full_messages.inspect}"
      next
    end

    # to_s guards against articles that carry no 'tags' field at all.
    category_names = blog['tags'].to_s.split(',')
    category_names.each do |category_name|
      category_local = Category.where(name: category_name, blog_id: blog_local.id).first
      Rails.logger.info "==== step 4 in create category, category_local is #{category_local.inspect} category_name: #{category_name} blog_local.id #{blog_local.id}"
      Category.create name: category_name, blog_id: blog_local.id if category_local.blank?
    end
  end

  # Throttle between pages.
  sleep 50
end