#!/usr/bin/env python3
# coding: utf-8

"""
Copyright (c) 2024 Huawei Device Co., Ltd.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

Description: utils for test suite
"""
import argparse
import json
import os
import stat
import time as times
from datetime import datetime, timedelta, time

import requests
import yaml
from lxml import etree

from result import get_result

# requests has no default timeout; without one a stalled connection hangs
# the crawler forever.
REQUEST_TIMEOUT_SECONDS = 30


def parse_config():
    """Load and return ../config.yaml (repo list and optional commit_start_time)."""
    config_file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '../config.yaml')
    with open(config_file_path, 'r', encoding='utf-8') as config_file:
        configs = yaml.safe_load(config_file)
    return configs


def get_url(name, page):
    """Build the Gitee merged-PR listing URL for repo `name`, page `page`.

    The listing is sorted by creation time descending (sort=created_at+desc),
    which get_commit_records relies on to stop paging early.
    """
    url_prefix = 'https://gitee.com/openharmony/'
    url_suffix = f'/pulls?assignee_id=&author_id=&label_ids=&label_text=&milestone_id=&page={page}' \
                 f'&priority=&project_type=&scope=&search=&single_label_id=&single_label_text=&' \
                 f'sort=created_at+desc&status=merged&target_project=&tester_id='
    return url_prefix + name + url_suffix


def get_html(url):
    """Fetch `url` and return the response body, or '' on any failure."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                      ' (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        # Finite timeout so a dead server cannot block the whole crawl.
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code == 200:
            return response.text
    except Exception:
        print("Failed to request the page")
    return ''


def write_data(repo_name, data_file, title, committer, commit_time_str, pr_link):
    """Append one commit record to `data_file` as a single JSON line."""
    data = {
        'repo_name': repo_name,
        'title': title,
        'committer': committer,
        'commit_time_str': commit_time_str,
        'pr_link': pr_link
    }
    # O_APPEND makes the OS-level descriptor append-only, matching the 'a'
    # fdopen mode (the original flags lacked it).
    flags = os.O_WRONLY | os.O_CREAT | os.O_APPEND
    mode = stat.S_IWUSR | stat.S_IRUSR
    with os.fdopen(os.open(data_file, flags, mode), 'a', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')


def get_commit_records(repo_name, commit_start_time, commit_end_time):
    """Crawl merged PRs of `repo_name` within [commit_start_time, commit_end_time].

    Pages through the Gitee listing (newest first) and writes every matching
    record to data.txt. Returns the number of records found; writes a
    placeholder record when nothing matched.
    """
    data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.txt')
    current_data_count = 0
    page = 1
    is_continue = True
    while is_continue:
        url = get_url(repo_name, str(page))
        html = get_html(url)
        if not html:
            # Request failed or empty body; etree.HTML cannot parse ''.
            break
        tree = etree.HTML(html)
        commit_list = tree.xpath('/html/body/div[2]/div[2]/div[2]/div[2]/div')
        if not commit_list:
            break
        for commit_task in commit_list:
            title = commit_task.xpath('.//div[1]/a/text()')[0]
            committer = commit_task.xpath('.//div[3]/span[2]/a/span/text()')[0]
            commit_time_str = commit_task.xpath('.//div[3]/span[4]/span/text()')[0].strip()
            pr_link = commit_task.xpath('.//div[1]/a/@href')[0]
            commit_time = datetime.strptime(commit_time_str, '%Y-%m-%d %H:%M')
            if commit_start_time <= commit_time <= commit_end_time:
                current_data_count += 1
                write_data(repo_name, data_file, title, committer, commit_time_str, pr_link)
            if commit_time < commit_start_time:
                # Listing is newest-first, so everything after this entry is
                # older than the window; stop paging after this page.
                is_continue = False
        page += 1

    if current_data_count == 0:
        print(f"repo {repo_name} no commit records were found within the specified time range")
        failed_message = (f'this repo has no commit record from {commit_start_time}'
                          f' to {commit_end_time.strftime("%Y-%m-%d %H:%M:%S")}')
        write_data(repo_name, data_file, failed_message, None, None, None)

    return current_data_count


def retry_after_crawl_failed(repo_list, commit_start_time, commit_end_time):
    """Crawl every repo in `repo_list`, retrying the whole batch on failure.

    Returns True once a full pass succeeds, False after max_retries failures.
    """
    max_retries = 5
    try_in = 2 * 60  # seconds to wait between attempts
    data_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data.txt')
    for i in range(max_retries):
        # Remove any (possibly partial) output before EACH attempt: the
        # original deleted data.txt only once, so a failed attempt left stale
        # records that were duplicated by the retry.
        if os.path.exists(data_file):
            os.remove(data_file)
        try:
            data_count = 0
            for repo_name in repo_list:
                current_data_count = get_commit_records(repo_name, commit_start_time, commit_end_time)
                data_count = data_count + current_data_count
            print(f'The data was successfully obtained, a total of {data_count} commit records were retrieved')
            print(f'Data statistics from {commit_start_time} to {commit_end_time.strftime("%Y-%m-%d %H:%M:%S")}'
                  f' were successfully retrieved')
            return True
        except Exception:
            print(f"get data failed! retrying... ({i + 1}/{max_retries})")
            times.sleep(try_in)

    return False


def parse_args():
    """Parse command-line options (--startTime, --commitRepo)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--startTime', type=str, dest='start_time', default=None,
                        help='specify crawl start time')
    parser.add_argument('--commitRepo', type=str, dest='commit_repo', default=None,
                        nargs='+',
                        help='get commit message in those repos')
    return parser.parse_args()


def clean_log():
    """Delete the generated commit_log.html from a previous run, if any."""
    commit_log_html_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                        'commit_log.html')
    if os.path.exists(commit_log_html_path):
        os.remove(commit_log_html_path)


def run():
    """Entry point: resolve the time window, crawl, then build the report."""
    clean_log()
    repo_list_configs = parse_config()
    # Default window: the whole of yesterday up to now.
    end_time = datetime.now()
    yesterday = datetime.now() - timedelta(days=1)
    start_time = datetime(yesterday.year, yesterday.month, yesterday.day, 0, 0, 0)
    repo_list = repo_list_configs.get('repo_list')

    arguments = parse_args()
    # CLI --startTime overrides the config value when provided.
    commit_start_time = repo_list_configs.get('commit_start_time') if arguments.start_time is None \
        else arguments.start_time
    if commit_start_time is not None:
        start_date = datetime.strptime(commit_start_time, '%Y-%m-%d')
        start_time = datetime.combine(start_date, time.min)
        end_time = start_time + timedelta(days=1)

    success = retry_after_crawl_failed(repo_list, start_time, end_time)
    if not success:
        print("Maximum retries reached, failed to crawl the data")
    else:
        get_result()


if __name__ == '__main__':
    run()