# Collect page URLs from the site's sitemap and submit them to Baidu's
# link-push endpoint (data.zz.baidu.com) using curl.
import logging
import os
import re
import subprocess
from io import StringIO
from urllib import request
# --- Site identity ---------------------------------------------------------
# Host name sent as the `site` query parameter and used to filter sitemap lines.
domain = "www.example.com"
# Credential sent as the `token` query parameter of the push request.
token = "xxxxxxxxxxxxxxxxx"
# Sitemap that is downloaded and scanned for page URLs.
site_map_url = "https://www.example.com/sitemap.xml"

# --- Local files and limits ------------------------------------------------
# Upper bound on the number of URLs written to the push file in one run.
push_max_lines = 1000
# Plain-text file (one URL per line) that curl uploads as the request body.
push_urls_file = "/tmp/baidu_zhanzhang_push_url.txt"
# File that receives the script's log records.
log_file = "/tmp/baidu/baidu_zhanzhang_push.log"

# --- Push endpoint ---------------------------------------------------------
push_url = "http://data.zz.baidu.com/urls?site=" + domain + "&token=" + token
# Loose "http(s) URL with a dotted host name" pattern, compiled once at import.
_LOOSE_URL_RE = re.compile(
    r'(http|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&:/~\+#]*[\w\-\@?^=%&/~\+#])?',
    re.IGNORECASE,
)


def regexpMatchUrl(content):
    """Return True if *content* contains something that looks like a URL.

    A match requires an ``http://`` or ``https://`` scheme followed by a host
    with at least one dot (so ``http://localhost`` does not count). Matching
    is case-insensitive.

    Args:
        content: Text to scan, e.g. one line of a sitemap.

    Returns:
        bool: True when at least one URL-like substring is present.
    """
    # search() stops at the first hit; only existence matters here.
    return _LOOSE_URL_RE.search(content) is not None
def regexpMatchWebSite(content):
    """Return True if *content* mentions the configured ``domain``.

    The module-level ``domain`` is matched literally and case-insensitively
    anywhere in *content*.

    Args:
        content: Text to scan, e.g. one line of a sitemap.

    Returns:
        bool: True when the domain occurs in *content*.
    """
    # re.escape keeps the dots in the host name literal; unescaped, each "."
    # would match any character and let unrelated hosts through.
    return re.search(re.escape(domain), content, re.IGNORECASE) is not None
# URL-ish run of characters followed by "<any char>html".
#
# The body is a single character class on purpose: it accepts the same
# characters as an alternation of letters | digits | [$-_@.&+] | [!*(),] |
# %XX escapes (the "$-_" range already covers "%", "/", ":", "<", ">" and
# more), but a class cannot match one character in two different ways, so a
# URL with no "html" suffix fails quickly instead of backtracking
# exponentially.
#
# NOTE(review): the "." before "html" is unescaped, so it matches any
# character (".shtml" and "/xhtml" are accepted too). Left as-is because
# tightening it would drop URLs that are currently pushed — confirm intent.
_HTML_PAGE_URL_RE = re.compile(
    r'http[s]?://[a-zA-Z0-9$-_@.&+!*\(\),]+.html',
    re.IGNORECASE,
)


def getUrl(content):
    """Extract the first ``...html`` URL from *content*.

    Matching is greedy: if several URLs share one line, the result runs from
    the first ``http(s)://`` to the last ``html`` suffix on that line.

    Args:
        content: Text to scan, e.g. one line of a sitemap.

    Returns:
        str: The matched URL, or ``''`` when nothing matches.
    """
    found = _HTML_PAGE_URL_RE.search(content)
    if found is None:
        return ''
    return found.group(0)
def createUrlFile(url_file_path, max_lines):
    """Download the sitemap and write its page URLs to *url_file_path*.

    The sitemap at the module-level ``site_map_url`` is fetched and decoded
    as UTF-8. Each line that contains a URL (``regexpMatchUrl``) on the
    configured domain (``regexpMatchWebSite``) is passed to ``getUrl``; every
    non-empty result is written on its own line.

    Args:
        url_file_path: Path of the output file; it is truncated and rewritten.
        max_lines: Maximum number of URLs to write. A value of zero or less
            produces an empty file.

    Returns:
        None. Errors from the download or from opening the output file
        propagate to the caller.
    """
    # Download first so a failed fetch does not truncate an existing file.
    with request.urlopen(site_map_url) as response:
        content = response.read().decode('utf8')

    written = 0
    # Context managers close both objects even if a helper raises mid-loop.
    with StringIO(content) as sitemap_lines, \
            open(url_file_path, 'w', encoding='utf-8') as url_file:
        for line in sitemap_lines:
            if written >= max_lines:
                break
            if not (regexpMatchUrl(line) and regexpMatchWebSite(line)):
                continue
            url = getUrl(line)
            if url:
                url_file.write(url + "\n")
                written += 1
def pushUrlFile(url, url_file_path, log_file):
    """POST the contents of *url_file_path* to *url* with curl and log the reply.

    curl's stdout and stderr are captured together and logged at INFO level.
    A non-zero exit status, or a failure to start curl at all, is logged at
    ERROR level instead of raising.

    Args:
        url: Push endpoint, including its query string.
        url_file_path: File whose bytes are sent as the request body.
        log_file: Not used by this function; kept so existing callers keep
            working. Log records go to the root logger.

    Returns:
        None.
    """
    # An argument list (no shell) keeps "&", "$", spaces and quotes in the
    # URL or the file path from being interpreted by a shell.
    command = [
        "curl",
        "-H", "Content-Type:text/plain",
        "--data-binary", "@" + url_file_path,
        url,
    ]
    try:
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # keep curl's diagnostics in the log
            universal_newlines=True,
        )
    except OSError as err:
        # e.g. curl is not installed or is not executable.
        logging.error("could not run curl: %s", err)
        return

    output = completed.stdout
    # Drop one trailing newline, matching subprocess.getstatusoutput().
    if output.endswith("\n"):
        output = output[:-1]

    if completed.returncode != 0:
        logging.error("curl exited with status %d", completed.returncode)
    logging.info("%s\n", output)
if __name__ == "__main__":
    # The log file is opened as soon as logging is configured, which fails
    # when its directory does not exist yet, so create the directory first.
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        filename=log_file,
    )

    try:
        # Step 1: rebuild the URL list from the sitemap.
        createUrlFile(push_urls_file, push_max_lines)
        # Step 2: upload that list to the push endpoint.
        pushUrlFile(push_url, push_urls_file, log_file)
    except Exception:
        # Record the traceback in the log file, then fail as before.
        logging.exception("sitemap push failed")
        raise
|