#!/usr/bin/env python3
# coding: utf-8

"""
Copyright (c) 2024 Huawei Device Co., Ltd.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Description: utils for test suite
"""
import argparse
import json
import os
import stat
import time as times
from datetime import datetime, timedelta, time

import requests
import yaml
from lxml import etree

from result import get_result

# requests has no default timeout; without one a stalled connection hangs
# the crawler forever.
REQUEST_TIMEOUT_SECONDS = 30


def parse_config():
    """Load and return ../config.yaml (repo list and optional commit_start_time)."""
    config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../config.yaml')
    with open(config_file_path, 'r', encoding='utf-8') as config_file:
        configs = yaml.safe_load(config_file)
    return configs


def get_url(name, page):
    """Build the Gitee merged-PR listing URL for repo `name`, page `page`.

    The listing is sorted by creation time descending (sort=created_at+desc),
    which get_commit_records relies on to stop paging early.
    """
    url_prefix = 'https://gitee.com/openharmony/'
    url_suffix = f'/pulls?assignee_id=&author_id=&label_ids=&label_text=&milestone_id=&page={page}' \
                 f'&priority=&project_type=&scope=&search=&single_label_id=&single_label_text=&' \
                 f'sort=created_at+desc&status=merged&target_project=&tester_id='
    return url_prefix + name + url_suffix


def get_html(url):
    """Fetch `url` and return the response body, or '' on any failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        # Finite timeout so a dead server cannot block the whole crawl.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            return response.text
    except Exception:
        print("Failed to request the page")
    return ''


def write_data(repo_name, data_file, title, committer, commit_time_str, pr_link):
    """Append one commit record to `data_file` as a single JSON line."""
    data = {
        'repo_name': repo_name,
        'title': title,
        'committer': committer,
        'commit_time_str': commit_time_str,
        'pr_link': pr_link
    }
    # O_APPEND makes the OS-level descriptor append-only, matching the 'a'
    # fdopen mode (the original flags lacked it).
    flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
    mode = stat.S_IWUSR | stat.S_IRUSR
    with os.fdopen(os.open(data_file, flags, mode), 'a', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')


def get_commit_records(repo_name, commit_start_time, commit_end_time):
    """Crawl merged PRs of `repo_name` within [commit_start_time, commit_end_time].

    Pages through the Gitee listing (newest first) and writes every matching
    record to data.txt. Returns the number of records found; writes a
    placeholder record when nothing matched.
    """
    data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.txt')
    current_data_count = 0
    page = 1
    is_continue = True
    while is_continue:
        url = get_url(repo_name, str(page))
        html = get_html(url)
        if not html:
            # Request failed or empty body; etree.HTML cannot parse ''.
            break
        tree = etree.HTML(html)
        commit_list = tree.xpath('/html/body/div[2]/div[2]/div[2]/div[2]/div')
        if not commit_list:
            break
        for commit_task in commit_list:
            title = commit_task.xpath('.//div[1]/a/text()')[0]
            committer = commit_task.xpath('.//div[3]/span[2]/a/span/text()')[0]
            commit_time_str = commit_task.xpath('.//div[3]/span[4]/span/text()')[0].strip()
            pr_link = commit_task.xpath('.//div[1]/a/@href')[0]
            commit_time = datetime.strptime(commit_time_str, '%Y-%m-%d %H:%M')
            if commit_start_time <= commit_time <= commit_end_time:
                current_data_count += 1
                write_data(repo_name, data_file, title, committer, commit_time_str, pr_link)
            if commit_time < commit_start_time:
                # Listing is newest-first, so everything after this entry is
                # older than the window; stop paging after this page.
                is_continue = False
        page += 1

    if current_data_count == 0:
        print(f"repo {repo_name} no commit records were found within the specified time range")
        failed_message = (f'this repo has no commit record from {commit_start_time}'
                          f' to {commit_end_time.strftime("%Y-%m-%d %H:%M:%S")}')
        write_data(repo_name, data_file, failed_message, None, None, None)

    return current_data_count


def retry_after_crawl_failed(repo_list, commit_start_time, commit_end_time):
    """Crawl every repo in `repo_list`, retrying the whole batch on failure.

    Returns True once a full pass succeeds, False after max_retries failures.
    """
    max_retries = 5
    try_in = 2 * 60  # seconds to wait between attempts
    data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.txt')
    for i in range(max_retries):
        # Remove any (possibly partial) output before EACH attempt: the
        # original deleted data.txt only once, so a failed attempt left stale
        # records that were duplicated by the retry.
        if os.path.exists(data_file):
            os.remove(data_file)
        try:
            data_count = 0
            for repo_name in repo_list:
                current_data_count = get_commit_records(repo_name, commit_start_time, commit_end_time)
                data_count = data_count + current_data_count
            print(f'The data was successfully obtained, a total of {data_count} commit records were retrieved')
            print(f'Data statistics from {commit_start_time} to {commit_end_time.strftime("%Y-%m-%d %H:%M:%S")}'
                  f' were successfully retrieved')
            return True
        except Exception:
            print(f"get data failed! retrying... ({i + 1}/{max_retries})")
            times.sleep(try_in)

    return False


def parse_args():
    """Parse command-line options (--startTime, --commitRepo)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--startTime', type=str, dest='start_time', default=None,
                        help='specify crawl start time')
    parser.add_argument('--commitRepo', type=str, dest='commit_repo', default=None,
                        nargs='+',
                        help='get commit message in those repos')
    return parser.parse_args()


def clean_log():
    """Delete the generated commit_log.html from a previous run, if any."""
    commit_log_html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                        'commit_log.html')
    if os.path.exists(commit_log_html_path):
        os.remove(commit_log_html_path)


def run():
    """Entry point: resolve the time window, crawl, then build the report."""
    clean_log()
    repo_list_configs = parse_config()
    # Default window: the whole of yesterday up to now.
    end_time = datetime.now()
    yesterday = datetime.now() - timedelta(days=1)
    start_time = datetime(yesterday.year, yesterday.month, yesterday.day, 0, 0, 0)
    repo_list = repo_list_configs.get('repo_list')

    arguments = parse_args()
    # CLI --startTime overrides the config value when provided.
    commit_start_time = repo_list_configs.get('commit_start_time') if arguments.start_time is None \
        else arguments.start_time
    if commit_start_time is not None:
        start_date = datetime.strptime(commit_start_time, '%Y-%m-%d')
        start_time = datetime.combine(start_date, time.min)
        end_time = start_time + timedelta(days=1)

    success = retry_after_crawl_failed(repo_list, start_time, end_time)
    if not success:
        print("Maximum retries reached, failed to crawl the data")
    else:
        get_result()


if __name__ == '__main__':
    run()