先用 SQLite 做了一个版本。SQLite 是一个关系型的嵌入式数据库,可以支持复杂的数据库逻辑,但相对而言效率不是很高;而这里要用到的数据逻辑并不复杂,因此可以使用 KV 数据库。改进版本使用了 Redis 这一 NoSQL 数据库,但 Redis 需要在使用机上开启监听端口,因此也不是很适合。最后改用了 LevelDB,效率较 SQLite 提高了 20 倍左右。
__author__ = 'glcsnz123' # -*- coding: utf-8 -*- import time,os,datetime import thread,sys,signal import sqlite3 import logging,atexit import upyun import Queue class EmptyLogger: def error(self,st): pass def debug(self,st): pass def info(self,st): pass def warning(self,st): pass #------------LOG CONFIG----------------- LOGGER = EmptyLogger() __LOGLEVEL = logging.INFO LOGFILE = "/tmp/UpYunSync.log" #-------------------------------------- def InitLogger(): global LOGGER,LOGFILE,__LOGLEVEL if __LOGLEVEL == logging.CRITICAL or not os.access(os.path.dirname(LOGFILE),os.W_OK): LOGGER = EmptyLogger() return LOGGER = logging.getLogger() hdlr = logging.FileHandler(LOGFILE) formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s') hdlr.setFormatter(formatter) LOGGER.addHandler(hdlr) LOGGER.setLevel(__LOGLEVEL) def SetLoggerLevel(loglevel=""):#DEBUG < INFO < WARNING < ERROR < OFF global __LOGLEVEL if loglevel.upper() == "INFO": LOGLEVEL = logging.INFO elif loglevel.upper() == "WARNING": LOGLEVEL = logging.WARNING elif loglevel.upper() == "ERROR": LOGLEVEL = logging.ERROR elif loglevel.upper() == "DEBUG": LOGLEVEL = logging.DEBUG elif loglevel.upper() == "OFF": LOGLEVEL = logging.CRITICAL else: LOGLEVEL = logging.NOTSET def getLastModifyTime(pathname):#获取最后一次修改时间 if os.path.isfile(pathname) or os.path.isdir(pathname): #如果存在该文件的话 return (datetime.datetime.fromtimestamp(os.path.getmtime(pathname))).strftime("%F %X") return (datetime.datetime.fromtimestamp(0)).strftime("%F %X") class UpYunUprSync: def __init__(self,BUCKETNAME='uprsyncx',USERNAME='admin',PASSWORD='adminadmin',SRC_DIR="/home/glcsnz123/Django-1.5.4",DST_DIR="/zlssasj",DBFILENAME=".UpYunsqlite.db"): if SRC_DIR.endswith(os.sep): SRC_DIR = SRC_DIR[0:-1:1] if DST_DIR.endswith(os.sep): DST_DIR = DST_DIR[0:-1:1] self.__BUCKETNAME = BUCKETNAME self.__USERNAME = USERNAME self.__PASSWORD = PASSWORD self.__SRC_DIR = SRC_DIR self.__DST_DIR = DST_DIR self.__HEADERS = {"x-gmkerl-rotate": "180"} self.__DBFILENAME = 
DBFILENAME self.__WORKER_LIMIT = 10 self.__ErrorFileList = [] self.__mkdirList = [] # 初始化sqlites self.__sqlites = UpYunsqlite(self.__SRC_DIR,self.__DBFILENAME) # daemon 配置信息 self.pidfile = "/tmp/uprsync.pid" self.stdout = "/dev/null" self.stderr = "/dev/null" def setThreadNumLimit(self,T_Limit): self.__WORKER_LIMIT = max(1,T_Limit) def __getFileList(self,fpath): start = datetime.datetime.now() if fpath.endswith(os.sep): pathStack = [fpath[0:-1:1]] else: pathStack = [fpath] self.__DFS_FINISHED = False self.__JobFileList = Queue.Queue(100000) #用来存放需要上传的文件的路径 self.__JobPathList = Queue.Queue() #用来存放需要创建的目录 dirList = [] #初始化目录 try: tmpList = os.listdir(fpath) except OSError,e: print "[ERROR]Permission Denied!\n " + fpath,"\n" LOGGER.error("[ERROR]Permission Denied!\n " + fpath + "\n\n") self.__DFS_FINISHED = True return if self.__DST_DIR != "": self.__CilentUp.mkdir(self.__DST_DIR) currentDir = os.sep.join(pathStack) sqlPathList = self.__sqlites.getPathFromsql(currentDir) sqlFileList = self.__sqlites.getFileFromsql(currentDir) for filename in tmpList: fullname = os.sep.join(pathStack) + os.sep + filename if os.path.isdir(fullname): if sqlPathList.has_key(filename) == False: self.__JobPathList.put(fullname) dirList.append(filename) elif os.path.islink(fullname): print "[WARNING]file:",fullname,"is a symbol link file\n" LOGGER.warning(fullname.join(["[WARNING]file: "," is a symbol link file\n\n"])) elif filename == self.__DBFILENAME:#数据库文件,不做任何处理! 
continue else: if sqlFileList.has_key(filename) == False: res = 1 elif sqlFileList[filename] == getLastModifyTime(fullname): print "[DEBUG]file:"," is not modified\n" LOGGER.debug(fullname.join(["[DEBUG]file: "," is not modified\n\n"])) continue else: res = 0 self.__JobFileList.put((fullname,res),block=True) while dirList.__len__() > 0: if dirList[-1] == "": pathStack.pop() dirList.pop() continue try: tmpList = os.listdir(os.sep.join(pathStack) + os.sep + dirList[-1]) except: print "[ERROR]Permission Denied!\n" + os.sep.join(pathStack) + os.sep + dirList[-1] + "\n" LOGGER.error("[ERROR]Permission Denied!\n" + os.sep.join(pathStack) + os.sep + dirList[-1] + "\n\n") continue pathStack.append(dirList[-1]) dirList.pop() dirList.append("") currentDir = os.sep.join(pathStack) sqlPathList = self.__sqlites.getPathFromsql(currentDir) sqlFileList = self.__sqlites.getFileFromsql(currentDir) for filename in tmpList: fullname = os.sep.join([currentDir,filename]) if os.path.isdir(fullname): if sqlPathList.has_key(filename) == False: self.__JobPathList.put(fullname) dirList.append(filename) elif os.path.islink(fullname): print "[WARNING]file:"," is a symbol link file!\n" LOGGER.warning(fullname.join(["[WARNING]file: "," is a symbol link file\n\n"])) else: if sqlFileList.has_key(filename) == False: res = 1 elif sqlFileList[filename] == getLastModifyTime(fullname): print "[DEBUG]file:",block=True) #此处代表已经完成了对目录的遍历工作,标记 self.__DFS_FINISHED = True print "[INFO] Finish the dfs after",(datetime.datetime.now() - start).seconds,"s\n" LOGGER.info("[INFO] Finish the dfs after " + (datetime.datetime.now() - start).seconds.__str__() + " s\n\n") def __FileSync(self,fpath): try: self.__CilentUp.put("".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]),open(fpath).read()) print "".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " oked\n" LOGGER.debug("".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " oked\n\n") return True except upyun.UpYunClientException as ce: 
self.__ErrorFileList.append(fpath) print "".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " Failed!\n" print "Except an UpYunClientException ..." print "Error Message: " + ce.msg + "\n" LOGGER.error("\n".join(["".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " Failed!\n","Except an UpYunClientException ...","Error Message: " + ce.msg + "\n\n"])) return False except upyun.UpYunServiceException as se: self.__ErrorFileList.append(fpath) print "".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " Failed\n" print "Except an UpYunServiceException ..." print "HTTP Status Code: " + str(se.status) print "Error Message: " + se.msg + "\n" LOGGER.error("\n".join(["".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]]) + " Failed\n","Except an UpYunServiceException ...\nHTTP Status Code: " + str(se.status),"Error Message: " + se.msg + "\n\n"])) if se.err: print se.err LOGGER.error(se.err + "\n\n") return False def __Worker(self,pid): try: waiting = 1 while True: try: if self.__DFS_FINISHED == False: self.__mkdirList[pid] = self.__JobPathList.get(block=True,timeout=5) else: self.__mkdirList[pid] = self.__JobPathList.get(block=False) fpath = self.__mkdirList[pid] while True: flag = 0 for i in range(self.__WORKER_LIMIT): if i != pid and self.__mkdirList[i] != "" and fpath.startswith(self.__mkdirList[i]): flag = 1 break if flag == 0: break else: time.sleep(2) self.__CilentUp.mkdir("".join([self.__DST_DIR,fpath[self.__SRC_DIR.__len__():]])) self.__mkdirList[pid] = "" self.__sqlites.insertPathTosql(fpath) continue except Queue.Empty,efr: pass except Exception,ex: pass try: fpath = self.__JobFileList.get(block=True,timeout=waiting) except Exception,e: if self.__DFS_FINISHED: return else: waiting = min(waiting * 2,30) continue waiting = max(waiting/2,1) if os.access(fpath[0],os.R_OK) == False: self.__ErrorFileList.append(fpath[0]) print fpath[0].join(["[ERROR] "," Permission Denied!Need Read access\n\n"]) LOGGER.error(fpath[0].join(["[ERROR] "," 
Permission Denied!Need Read access\n\n"])) continue res = self.__FileSync(fpath[0]) #if sync success,update the datebase. if res == True: if fpath[1] == 1: self.__sqlites.insertFileTosql(fpath[0]) elif fpath[1] == 0: self.__sqlites.updateFileTosql(fpath[0]) else: print "==============ERROR===================" except Exception,e: print "[WARNING] a thread died.",e,"\n" LOGGER.warning("[WARNING] a thread died." + e.__str__() + "\n\n") finally: self.__WORKER_NOW -= 1 self.__mkdirList[pid]="" def __InitLogIn(self): self.__CilentUp = upyun.UpYun(self.__BUCKETNAME,self.__USERNAME,self.__PASSWORD,timeout=30,endpoint=upyun.ED_AUTO) def runMultiThreadSync(self): start = datetime.datetime.now() self.__WORKER_NOW = self.__WORKER_LIMIT self.__mkdirList = [""] * self.__WORKER_LIMIT self.__InitLogIn() thread.start_new_thread(self.__getFileList,(self.__SRC_DIR,)) time.sleep(3) for i in range(self.__WORKER_LIMIT): thread.start_new_thread(self.__Worker,(i,)) while self.__WORKER_NOW > 0: while self.__WORKER_NOW < self.__WORKER_LIMIT and (self.__JobPathList.qsize()>0 or self.__JobFileList.qsize()>0): thread.start_new_thread(self.__Worker,()) self.__WORKER_NOW += 1 self.__mkdirList.append("") print "[INFO] Create a new Thread! \n" LOGGER.info("[INFO] Create a new Thread! 
\n\n") # time.sleep(20) for i in range(4): print self.__JobFileList.qsize(),self.__JobPathList.qsize() LOGGER.debug("[INFO]" + str(self.__JobFileList.qsize()) + " files are found and waiting for sync.") time.sleep(5) self.RollBack() print "[INFO]Finish uprsync after " + (datetime.datetime.now() - start).seconds.__str__() + " s\n" LOGGER.info("[INFO]Finish uprsync after " + (datetime.datetime.now() - start).seconds.__str__() + " s\n\n") def RollBack(self): for fullname in self.__ErrorFileList: print fullname.join(["[WARNING] "," is rolling back!"]) LOGGER.warning(fullname.join(["[WARNING] "," is rolling back!"])) pathname = os.path.dirname(fullname)[self.__SRC_DIR.__len__():] while os.path.basename(pathname) != "": self.__sqlites.rollBackPathTosql(pathname) pathname = os.path.dirname(pathname) #Daemon ------------------------------------------ def __daemonize(self): try: pid = os.fork() if pid > 0: sys.exit(0) except OSError,ose: sys.stderr.write("[Daemon ERROR]fork #1 Failed: %d (%s)\n" % (ose.errno,ose.strerror)) LOGGER.error("[Daemon ERROR]fork #1 Failed: %d (%s)\n\n" % (ose.errno,ose.strerror)) sys.exit(1) #脱离终端 os.setsid() #修改当前工作目录 os.chdir("/") #重设文件创建权限 os.umask(0) #第二次fork,禁止进程重新打开控制终端 try: pid = os.fork() if pid > 0: sys.exit(0) except OSError,e: sys.stderr.write("[Daemon ERROR]fork #2 Failed: %d (%s)\n" % (e.errno,e.strerror)) LOGGER.error("[Daemon ERROR]fork #2 Failed: %d (%s)\n\n" % (e.errno,e.strerror)) sys.exit(1) sys.stdout.flush() sys.stderr.flush() so = file(self.stdout,'a+') se = file(self.stderr,'a+',0) #重定向标准输出/错误 os.dup2(so.fileno(),sys.stdout.fileno()) os.dup2(se.fileno(),sys.stderr.fileno()) #注册程序退出时的函数,即删掉pid文件 atexit.register(self.__delpid) pid = str(os.getpid()) file(self.pidfile,'w+').write("%s\n" % pid) def __delpid(self): os.remove(self.pidfile) def start(self): """ Start the daemon """ print "[Deamon DEBUG]start in Daemon\n" LOGGER.debug("[Daemon DEBUG]start in Daemon\n\n") # Check for a pidfile to see if the daemon already runs try: 
pf = file(self.pidfile,'r') pid = int(pf.read().strip()) pf.close() except IOError: pid = None if pid: message = "pidfile %s already exist. Daemon already running?\n\n" sys.stderr.write(message % self.pidfile) LOGGER.error("[Daemon ERROR]pidfile %s already exist. Daemon already running?\n\n" % self.pidfile) sys.exit(1) # Start the daemon self.__daemonize() self.__run() def stop(self): """ Stop the daemon """ # Get the pid from the pidfile print "[Deamon DEBUG]stop in Daemon\n" LOGGER.debug("[Daemon DEBUG]stop in Daemon\n\n") try: pf = file(self.pidfile,'r') pid = int(pf.read().strip()) pf.close() except IOError: pid = None if not pid: message = "pidfile %s does not exist. Daemon not running?\n" sys.stderr.write(message % self.pidfile) LOGGER.error("[Daemon ERROR]pidfile %s does not exist. Daemon not running?\n\n" % self.pidfile) return # not an error in a restart # Try killing the daemon process try: while True: os.kill(pid,signal.SIGTERM) time.sleep(0.1) except OSError,err: err = str(err) if err.find("No such process") > 0: if os.path.exists(self.pidfile): os.remove(self.pidfile) else: print str(err) sys.exit(1) def restart(self): """ Restart the daemon """ self.stop() self.start() def __run(self): self.runMultiThreadSync() #---------------------------- DB ---------------------------------------- class UpYunsqlite: """ 元数据的操作 """ def __init__(self,SRC_DIR="/home/glcsnz123",DBFILENAME="example.db"): self.SRC_DIR = SRC_DIR self.DBFILENAME = DBFILENAME self.__InitConnect() self.locksql = thread.allocate_lock() def __InitDBFile(self): if not os.access(self.SRC_DIR,os.W_OK): print "[ERROR]No write access in current directory" LOGGER.error("[ERROR]No write access in current directory") sys.exit("403") conn = sqlite3.connect(os.sep.join([self.SRC_DIR,self.DBFILENAME])) cur = conn.cursor() cur.execute("CREATE TABLE FileModify(id INTEGER PRIMARY KEY AUTOINCREMENT,filename VARCHAR(256),\ pathname VARCHAR(256),last_modify DATE)") cur.execute("CREATE TABLE PathModify(id 
INTEGER PRIMARY KEY AUTOINCREMENT,pathname VARCHAR(256),\ fatherpath VARCHAR(256),last_modify DATE)") cur.close() conn.close() def __InitConnect(self): if os.path.isfile(os.sep.join([self.SRC_DIR,self.DBFILENAME])) == False: self.__InitDBFile() self.CONN = sqlite3.connect(os.sep.join([self.SRC_DIR,self.DBFILENAME]),check_same_thread=False) self.CUR = self.CONN.cursor() def getPathFromsql(self,fapath): query = "select pathname,last_modify from PathModify where fatherpath='%s'" % ( self.__rpQuota(fapath[self.SRC_DIR.__len__():])) try: self.locksql.acquire() self.CUR.execute(query) res = self.CUR.fetchall() finally: self.locksql.release() resObj = {} for i in range(res.__len__()): resObj[res[i][0]] = res[i][1] return resObj def getFileFromsql(self,fapath): query = "select filename,last_modify from FileModify where pathname = '%s'" % ( self.__rpQuota(fapath[self.SRC_DIR.__len__():])) try: self.locksql.acquire() self.CUR.execute(query) res = self.CUR.fetchall() finally: self.locksql.release() resObj = {} for i in range(res.__len__()): resObj[res[i][0]] = res[i][1] return resObj def updateFileTosql(self,fpath): query = r"update FileModify SET last_modify = '%s' where filename = '%s' AND pathname = '%s'" % ( getLastModifyTime(fpath),self.__rpQuota(os.path.basename(fpath)),self.__rpQuota(os.path.dirname(fpath)[self.SRC_DIR.__len__():])) try: self.locksql.acquire() self.CUR.execute(query) self.CONN.commit() except sqlite3.OperationalError,soe: print "[Error] Syntax error!\nError Query: " + query,"\nError Path: " + fpath + '\n' LOGGER.error("[Error] Syntax error!\nError Query: " + query + "\nError Path: " + fpath + '\n\n') finally: self.locksql.release() def insertFileTosql(self,fpath): query = r"insert into FileModify(filename,pathname,last_modify) values('%s','%s','%s')" % ( self.__rpQuota(os.path.basename(fpath)),self.__rpQuota(os.path.dirname(fpath)[self.SRC_DIR.__len__():]),getLastModifyTime(fpath)) try: self.locksql.acquire() self.CUR.execute(query) self.CONN.commit() 
except sqlite3.OperationalError,"\nError Path: " + fpath + '\n' LOGGER.error("[Error] Syntax error!\nError Query: " + query + "\nError Path: " + fpath + '\n\n') finally: self.locksql.release() def updatePathTosql(self,pathname): query = r"update PathModify SET last_modify = '%s' where pathname = '%s' and fatherpath='%s'" % ( getLastModifyTime(pathname),self.__rpQuota(os.path.basename(pathname)),self.__rpQuota(os.path.dirname(pathname)[self.SRC_DIR.__len__():])) try: self.locksql.acquire() self.CUR.execute(query) self.CONN.commit() except sqlite3.OperationalError,soe: print "[Error] Syntax error!\n Error Query: " + query,"\nError Path: " + pathname + "\n" LOGGER.error("[Error] Syntax error!\n Error Query: " + query + "\nError Path: " + pathname + "\n\n") finally: self.locksql.release() def rollBackPathTosql(self,pathname): query = r"update PathModify SET last_modify = '%s' where pathname = '%s' and fatherpath='%s'" % ( getLastModifyTime(""),self.__rpQuota(os.path.dirname(pathname))) try: self.locksql.acquire() self.CUR.execute(query) self.CONN.commit() except sqlite3.OperationalError,"\nError Path: " + pathname + "\n" LOGGER.error("[Error] Syntax error!\n Error Query: " + query + "\nError Path: " + pathname + "\n\n") finally: self.locksql.release() def insertPathTosql(self,pathname): query = r"insert into PathModify(pathname,fatherpath,'%s')" % ( self.__rpQuota(os.path.basename(pathname)),self.__rpQuota(os.path.dirname(pathname)[self.SRC_DIR.__len__():]),getLastModifyTime(pathname)) try: self.locksql.acquire() self.CUR.execute(query) self.CONN.commit() except sqlite3.OperationalError,"\nError Path: " + pathname + "\n" LOGGER.error("[Error] Syntax error!\n Error Query: " + query + "\nError Path: " + pathname + "\n\n") finally: self.locksql.release() def __rpQuota(self,st): return st.replace("'",r"//") if __name__ == "__main__": InitLogger() ups = UpYunUprSync() ups.setThreadNumLimit(20) ups.runMultiThreadSync() ############################### # 
问题主要出在了数据库的询问上,所以必须减少数据库的访问 # getFileList函数主要的花费是在mkdir和数据库操作上。,数据库操作占了50%的花费 # mkdir大概花了25%的时间代价 #
ReadMe.txt

基本函数接口

初始化日志:
    InitLogger()
默认日志等级为 INFO。

设置日志等级:
    SetLoggerLevel(log_level)
参数 log_level 为日志的等级,可选日志等级有(不区分大小写): "DEBUG" "INFO" "WARNING" "ERROR" "OFF"。其中 OFF 表示关闭日志功能。另外,如果未初始化日志,日志功能默认是关闭的。

日志的存放地址默认为 /tmp/UpYunSync.log。同时,也可通过以下方法来设定日志文件地址:
    UpYunrSync.LOGFILE = LOG_FILE
其中参数 LOG_FILE 为日志文件的地址。

初始化 UpYunrSync:
    import UpYunrSync
    ups = UpYunrSync.UpYunUprSync(BUCKETNAME,USERNAME,PASSWORD,SRC_DIR,DST_DIR,DBFILENAME)
其中参数 BUCKETNAME 为空间名称,USERNAME 和 PASSWORD 分别为授权操作员帐号和密码,SRC_DIR 和 DST_DIR 分别为需要同步的本地目录和服务器目录,必选。参数 DBFILENAME 为存储本地文件元数据的 sqlite 文件,默认值为 .UpYunsqlite.db。

设置线程开启个数:
    ups.setThreadNumLimit(Thread_Num)
参数 Thread_Num 为上传文件的线程个数。线程个数并不是越多越好,应当根据所要上传的目录中文件个数以及大小来确定。

同步目录

终端直接同步文件形式:
    ups.runMultiThreadSync()
执行文件同步的操作。

后台守护进程同步文件形式:
    ups.start()
    ups.stop()
    ups.restart()
三个方法分别是启动进程、停止进程和重启进程。

后台进程模式下,分别设置程序标准输出、错误输出和 PID 文件位置的方法如下:
    ups.stdout = Stdout_File
    ups.stderr = Stderr_File
    ups.pidfile = Pid_File
其中,参数 Stdout_File 和 Stderr_File 分别是标准输出和错误输出的文件地址,默认值为 /dev/null。Pid_File 为进程的 PID 文件位置,默认值为 /tmp/uprsync.pid。

原文链接:https://www.f2er.com/sqlite/201011.html