diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..da77629 --- /dev/null +++ b/.env.example @@ -0,0 +1,9 @@ +PASSWORD_MONGODB = "" +URL_MONGODB = "" +PWD_EMAIL = "" +EMAIL = "" +SMTP_SERVER = "" +SMTP_PORT = "" +EMAIL_SCHOOL = "" +LINK_SCHOOL_TIME = "" +FILE_DIRECTORY_SCHOOL = "" \ No newline at end of file diff --git a/.gitignore b/.gitignore index e9a6cb8..afd3264 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ test.xlsx -.env \ No newline at end of file +.env +geckodriver.log \ No newline at end of file diff --git a/src/events/school_time/check_email.py b/src/events/school_time/check_email.py deleted file mode 100644 index 068d263..0000000 --- a/src/events/school_time/check_email.py +++ /dev/null @@ -1,40 +0,0 @@ -import imaplib -import email -import traceback -import os -from dotenv import load_dotenv - -load_dotenv() -FROM_PWD = os.getenv('PWD_EMAIL') -FROM_EMAIL = os.getenv('EMAIL') -SMTP_SERVER = os.getenv('SMTP_SERVER') -SMTP_PORT = os.getenv('SMTP_PORT') - -def read_email_from_gmail(): - try: - mail = imaplib.IMAP4_SSL(SMTP_SERVER) - mail.login(FROM_EMAIL,FROM_PWD) - mail.select('inbox') - - data = mail.search(None, 'ALL') - mail_ids = data[1] - id_list = mail_ids[0].split() - first_email_id = int(id_list[0]) - latest_email_id = int(id_list[-1]) - - for i in range(latest_email_id,first_email_id, -1): - data = mail.fetch(str(i), '(RFC822)' ) - for response_part in data: - arr = response_part[0] - if isinstance(arr, tuple): - msg = email.message_from_string(str(arr[1],'utf-8')) - email_subject = msg['subject'] - email_from = msg['from'] - print('From : ' + email_from + '\n') - print('Subject : ' + email_subject + '\n') - - except Exception as e: - traceback.print_exc() - print(str(e)) - -read_email_from_gmail() \ No newline at end of file diff --git a/src/events/school_time/scraping_excelfile.py b/src/events/school_time/scraping_excelfile.py new file mode 100644 index 0000000..6ec2680 --- /dev/null +++ 
b/src/events/school_time/scraping_excelfile.py @@ -0,0 +1,45 @@ +import subprocess +import os +import tabula +from dotenv import load_dotenv +from selenium import webdriver +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +load_dotenv() +LINK_SCHOOL_TIME = os.getenv('LINK_SCHOOL_TIME') +FILE_DIRECTORY_SCHOOL = os.getenv('FILE_DIRECTORY_SCHOOL') +options = Options() +options.add_argument("--headless") +options.add_argument('--disable-gpu') +options.add_argument('--disable-software-rasterizer') + +driver = webdriver.Firefox(options=options) +#open the page at LINK_SCHOOL_TIME +driver.get(LINK_SCHOOL_TIME) +#locate the anchor element(s) by absolute XPath + +elems = driver.find_elements(By.XPATH, "/html/body/section[2]/div/div/main/div/div/div/div/div[2]/p[2]/a") + +for elem in elems: + link = elem.get_attribute("href") + +remove_things_in_front = link.split(FILE_DIRECTORY_SCHOOL, 1)[1] +print(remove_things_in_front) +subprocess.run(["wget", link]) + +driver.close() + +namefile = remove_things_in_front +df = tabula.read_pdf(namefile, pages = 'all')[0] +tabula.convert_into(namefile, "test.csv", output_format="csv", pages='all') +print(df) + +from pyexcel.cookbook import merge_all_to_a_book +# import pyexcel.ext.xlsx # no longer required if you use pyexcel >= 0.2.2 +import glob + + +merge_all_to_a_book(glob.glob("*.csv"), "school_time.xlsx") \ No newline at end of file diff --git a/src/events/school_time/update_time_school.py b/src/events/school_time/update_time_school.py index 63b59db..0e6d1dd 100644 --- a/src/events/school_time/update_time_school.py +++ b/src/events/school_time/update_time_school.py @@ -19,7 +19,7 @@ collection_archive = database["archive-school-time-table"] x = collection.delete_many({}) #using read_excel() method to read our excel file and storing the same in the variable named "df " -workbook = 
xl.load_workbook(filename="test.xlsx") +workbook = xl.load_workbook(filename="school_time.xlsx") ws = workbook.active @@ -63,19 +63,19 @@ for row in range (1, 100): } ) else: - remove_things_in_front = school_subject.split(' ', 1)[1] + #remove_things_in_front = school_subject.split(' ', 1)[1] find_document_username = list(collection.find({}, {"Date": long_date})) array_username = find_document_username[0]["_id"] collection.update_one( { "_id": ObjectId(array_username)}, { - "$push": { "School Subject": str(remove_things_in_front) } + "$push": { "School Subject": school_subject } } ) collection_archive.update_one( { "_id": ObjectId(array_username)}, { - "$push": { "School Subject": str(remove_things_in_front) } + "$push": { "School Subject": school_subject } } ) @@ -112,4 +112,6 @@ for row in range (1, 100): { "$push": { "Teacher": teacher } } - ) \ No newline at end of file + ) + +os.remove("school_time.xlsx") \ No newline at end of file