Now I can download and convert the file automatically, but the automatic generation of the school timetable doesn't work correctly at the moment

2025-06-06 00:39:12 +02:00 · 2023-01-04 16:33:28 +00:00
parent 03ce9b49e8
commit bced0ee355
5 changed files with 63 additions and 46 deletions
--- a/.env.example
+++ b/.env.example
@@ -0,0 +1,9 @@
+PASSWORD_MONGODB = ""
+URL_MONGODB = ""
+PWD_EMAIL = ""
+EMAIL = ""
+SMTP_SERVER = ""
+SMTP_PORT = ""
+EMAIL_SCHOOL = ""
+LINK_SCHOOL_TIME = ""
+FILE_DIRECTORY_SCHOOL = ""
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 test.xlsx
 .env
+geckodriver.log
--- a/src/events/school_time/check_email.py
+++ b/src/events/school_time/check_email.py
@@ -1,40 +0,0 @@
-import imaplib
-import email
-import traceback 
-import os
-from dotenv import load_dotenv
-
-load_dotenv()
-FROM_PWD = os.getenv('PWD_EMAIL')
-FROM_EMAIL = os.getenv('EMAIL')
-SMTP_SERVER = os.getenv('SMTP_SERVER')
-SMTP_PORT = os.getenv('SMTP_PORT')
-
-def read_email_from_gmail():
-    try:
-        mail = imaplib.IMAP4_SSL(SMTP_SERVER)
-        mail.login(FROM_EMAIL,FROM_PWD)
-        mail.select('inbox')
-
-        data = mail.search(None, 'ALL')
-        mail_ids = data[1]
-        id_list = mail_ids[0].split()   
-        first_email_id = int(id_list[0])
-        latest_email_id = int(id_list[-1])
-
-        for i in range(latest_email_id,first_email_id, -1):
-            data = mail.fetch(str(i), '(RFC822)' )
-            for response_part in data:
-                arr = response_part[0]
-                if isinstance(arr, tuple):
-                    msg = email.message_from_string(str(arr[1],'utf-8'))
-                    email_subject = msg['subject']
-                    email_from = msg['from']
-                    print('From : ' + email_from + '\n')
-                    print('Subject : ' + email_subject + '\n')
-
-    except Exception as e:
-        traceback.print_exc() 
-        print(str(e))
-
-read_email_from_gmail()
--- a/src/events/school_time/scraping_excelfile.py
+++ b/src/events/school_time/scraping_excelfile.py
@@ -0,0 +1,64 @@
+"""Download the school timetable PDF and convert it to school_time.xlsx.
+
+Opens LINK_SCHOOL_TIME in headless Firefox, reads the href of the timetable
+link, downloads that file with wget, extracts its tables into test.csv with
+tabula and merges the CSV files of the working directory into one workbook.
+"""
+import glob
+import os
+import subprocess
+
+import tabula
+from dotenv import load_dotenv
+from pyexcel.cookbook import merge_all_to_a_book
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.ui import WebDriverWait
+
+load_dotenv()
+LINK_SCHOOL_TIME = os.getenv('LINK_SCHOOL_TIME')
+FILE_DIRECTORY_SCHOOL = os.getenv('FILE_DIRECTORY_SCHOOL')
+if not LINK_SCHOOL_TIME or not FILE_DIRECTORY_SCHOOL:
+    # str.split(None) would silently split on whitespace, so fail early instead.
+    raise SystemExit("LINK_SCHOOL_TIME and FILE_DIRECTORY_SCHOOL must be set in .env")
+
+options = Options()
+options.add_argument("--headless")
+options.add_argument('--disable-gpu')
+options.add_argument('--disable-software-rasterizer')
+
+driver = webdriver.Firefox(options=options)
+try:
+    driver.get(LINK_SCHOOL_TIME)
+    # Locate the timetable anchor(s) through their absolute XPath.
+    elems = driver.find_elements(By.XPATH, "/html/body/section[2]/div/div/main/div/div/div/div/div[2]/p[2]/a")
+    links = [href for href in (elem.get_attribute("href") for elem in elems) if href]
+finally:
+    # quit() also ends the geckodriver session, even when the lookup raised.
+    driver.quit()
+
+if not links:
+    raise SystemExit("No timetable link found on the school page")
+# As before, the last matching anchor wins.
+link = links[-1]
+
+# Part of the URL after the configured directory, e.g. the PDF file name.
+_, found, remove_things_in_front = link.partition(FILE_DIRECTORY_SCHOOL)
+if not found:
+    raise SystemExit(f"FILE_DIRECTORY_SCHOOL not found in link: {link}")
+print(remove_things_in_front)
+
+# wget stores files in the current directory; -O pins the name that tabula reads
+# and overwrites a copy left by a previous run (plain wget would write "name.1").
+namefile = os.path.basename(remove_things_in_front)
+subprocess.run(["wget", "-O", namefile, link], check=True)
+
+tables = tabula.read_pdf(namefile, pages='all')
+tabula.convert_into(namefile, "test.csv", output_format="csv", pages='all')
+if tables:
+    print(tables[0])
+
+# NOTE: every *.csv in the working directory becomes a sheet of the workbook.
+merge_all_to_a_book(glob.glob("*.csv"), "school_time.xlsx")
--- a/src/events/school_time/update_time_school.py
+++ b/src/events/school_time/update_time_school.py
@@ -19,7 +19,7 @@ collection_archive = database["archive-school-time-table"]
 x = collection.delete_many({})

 #using read_excel() method to read our excel file and storing the same in the variable named "df "
-workbook = xl.load_workbook(filename="test.xlsx")
+workbook = xl.load_workbook(filename="school_time.xlsx")

 ws = workbook.active

@@ -63,19 +63,19 @@ for row in range (1, 100):
                            }
                        )
                else:
-                    remove_things_in_front = school_subject.split(' ', 1)[1]
+                    # The subject is stored as-is; its leading token is no longer stripped.
                    find_document_username = list(collection.find({}, {"Date": long_date}))
                    array_username = find_document_username[0]["_id"]
                    collection.update_one(
                        { "_id": ObjectId(array_username)},
                            {
-                                "$push": { "School Subject": str(remove_things_in_front) }
+                                "$push": { "School Subject": school_subject }
                            }
                        )
                    collection_archive.update_one(
                        { "_id": ObjectId(array_username)},
                            {
-                                "$push": { "School Subject": str(remove_things_in_front) }
+                                "$push": { "School Subject": school_subject }
                            }
                        )
                        
@@ -113,3 +113,5 @@ for row in range (1, 100):
                                "$push": { "Teacher": teacher }
                            }
                        )
+
+os.remove("school_time.xlsx")