# image.png
#coding=utf-8
import re,urllib2,os,urllib,requests,json,cPickle,time
def getHtmlCode(url):
    """Fetch *url* and return the raw response body.

    Args:
        url: URL passed unchanged to ``urllib2.urlopen``.

    Returns:
        Whatever ``response.read()`` yields for the opened URL.

    Raises:
        Any exception raised by ``urllib2.urlopen`` or ``read`` propagates
        to the caller (for example ``urllib2.URLError``).
    """
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        # Close explicitly so a long crawl does not accumulate open
        # connections; try/finally is used because Python 2 urllib2
        # responses cannot be used in a ``with`` statement.
        response.close()
def getUrls(htmlString):
    """Extract video path identifiers from ergengtv.com links in *htmlString*.

    Looks for ``href="//www.ergengtv.com/video/<id>.html`` and returns the
    ``<id>`` part of each occurrence, in document order, duplicates included.
    The capture is non-greedy but may contain slashes, so nested paths under
    ``/video/`` are returned as-is.

    Args:
        htmlString: Page markup to scan.

    Returns:
        A list of captured identifier strings; empty when nothing matches.
    """
    # Raw string with escaped dots: the host name and the ".html" suffix
    # must match literally instead of matching any character.
    regUrl = re.compile(r'href="//www\.ergengtv\.com/video/(.+?)\.html')
    return regUrl.findall(htmlString)
def getTitle(htmlString):
    """Return every value found between ``title": "`` and the next ``",``.

    Args:
        htmlString: Text to scan.

    Returns:
        A list with one captured string per occurrence, in order of
        appearance; empty when there is no match.
    """
    return re.findall('title": "(.+?)",', htmlString)
def getMediaId(htmlString):
    """Return the text following each ``media_id": `` up to the next comma.

    Args:
        htmlString: Text to scan.

    Returns:
        A list of captured strings in order of appearance; empty when
        there is no match.
    """
    matcher = re.compile('media_id": (.+?),')
    return [hit.group(1) for hit in matcher.finditer(htmlString)]
def getCreateTime(htmlString):
    """Return the text following each ``create_at": `` up to the next comma.

    Args:
        htmlString: Text to scan.

    Returns:
        A list of captured strings in order of appearance; empty when
        there is no match.
    """
    pattern = 'create_at": (.+?),'
    found = re.findall(pattern, htmlString)
    return found
def _toHttps(url):
    """Return *url* with a leading ``http://`` replaced by ``https://``."""
    # Only the scheme prefix is rewritten; a blanket replace('http', 'https')
    # would turn an existing "https" into "httpss" and also rewrite any
    # later "http" substring inside the URL.
    if url.startswith('http://'):
        return 'https://' + url[len('http://'):]
    return url


def _loadSavedMediaIds(path):
    """Load the pickled list of downloaded media ids, or [] if *path* is absent."""
    if not os.path.exists(path):
        return []
    # NOTE(review): unpickling runs code embedded in the file; only load a
    # file this script wrote itself.
    with open(path, 'rb') as pklFile:
        return cPickle.load(pklFile)


def _saveMediaIds(path, mediaIds):
    """Pickle *mediaIds* to *path* (protocol 1), closing the file on any outcome."""
    with open(path, 'wb') as pklFile:
        cPickle.dump(mediaIds, pklFile, protocol=1)


def _downloadVideo(mediaId, fileName):
    """Resolve the 1080p URL for *mediaId* and save it as videos/<fileName>.mp4."""
    apiUrl = 'https://member.ergengtv.com/api/video/vod/?id=' + mediaId
    decodeJson = json.loads(getHtmlCode(apiUrl))
    downloadUrl = _toHttps(decodeJson["msg"]["segs"]["1080p"][0]["url"])
    urllib.urlretrieve(downloadUrl, "%s.mp4" % ("videos/" + fileName))


def main(startPage=27, stopPage=119):
    """Crawl the video list pages and download every video not yet saved.

    For each list page in ``range(startPage, stopPage)`` the linked video
    pages are fetched, their creation time, title and media id are extracted,
    and videos whose media id is not already recorded in ``mediaIds.pkl`` are
    downloaded into ``videos/``. The id list is re-pickled after each
    successful download so an interrupted run can resume.

    Args:
        startPage: First list-page number to crawl (inclusive).
        stopPage: List-page number at which to stop (exclusive).
    """
    savedIdsPath = 'mediaIds.pkl'
    mediaIdSaved = _loadSavedMediaIds(savedIdsPath)
    print(len(mediaIdSaved))

    # urlretrieve does not create missing directories.
    if not os.path.isdir('videos'):
        os.makedirs('videos')

    for i in range(startPage, stopPage):
        url = 'https://www.ergengtv.com/video/list/0_' + str(i) + '.html'
        # set() drops links that appear more than once on the list page.
        for urlId in set(getUrls(getHtmlCode(url))):
            url2 = 'https://www.ergengtv.com/video/' + urlId + '.html'
            htmlCode2 = getHtmlCode(url2)
            createTimes = getCreateTime(htmlCode2)
            titles = getTitle(htmlCode2)
            mediaIds = getMediaId(htmlCode2)
            if not (createTimes and titles and mediaIds):
                # A page without the expected metadata must not abort the
                # whole crawl with an IndexError.
                print('skip (metadata not found): ' + url2)
                continue

            # presumably create_at is a Unix timestamp in seconds -- it is
            # fed to time.localtime() as a float; confirm against the site.
            timeString = time.localtime(float(createTimes[0]))
            createTime = time.strftime('%Y-%m-%d', timeString)
            mediaId = mediaIds[0]
            fileName = createTime + '--' + titles[0]
            print(fileName)

            if mediaId in mediaIdSaved:
                print('exsied--------------> ')
                continue

            try:
                # A "/" in the title would otherwise be read as a directory
                # separator in the output path.
                _downloadVideo(mediaId, fileName.replace('/', '_'))
                print('done')
                mediaIdSaved.append(mediaId)
                _saveMediaIds(savedIdsPath, mediaIdSaved)
            except Exception as err:
                # Best effort per video, but report the cause and let
                # KeyboardInterrupt / SystemExit through.
                print('error: %r' % (err,))


if __name__ == '__main__':
    main()