Use clean_html() from ..utils instead of the local _clearn_html() helper

This commit is contained in:
huohuarong 2013-08-03 10:29:58 +08:00
parent 6624a2b07d
commit 4ec929dc9b
1 changed files with 6 additions and 13 deletions

View File

@ -7,7 +7,7 @@ import logging
import urllib2 import urllib2
from .common import InfoExtractor from .common import InfoExtractor
from ..utils import compat_urllib_request from ..utils import compat_urllib_request, clean_html
class SohuIE(InfoExtractor): class SohuIE(InfoExtractor):
@ -22,16 +22,6 @@ class SohuIE(InfoExtractor):
}, },
} }
def _clearn_html(self, string):
    """Strip HTML tags from *string* and normalise its whitespace.

    Every ``<...>`` tag is replaced by a single space, every run of
    whitespace is collapsed to one space, and leading/trailing
    whitespace is removed.

    :param string: text that may contain HTML markup
    :returns: the cleaned text
    """
    # Tags become a space (not an empty string) so the words on either
    # side of a tag stay separated.  '.' does not match a newline, so a
    # tag that spans a line break is left untouched, as before.
    without_tags = re.sub(r'<.+?>', ' ', string)
    # One regex pass collapses whitespace runs of any length; repeated
    # str.replace() passes over the runs found up front could leave the
    # longer runs only partly collapsed.
    return re.sub(r'\s+', ' ', without_tags).strip()
def _real_extract(self, url): def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url) mobj = re.match(self._VALID_URL, url)
@ -40,7 +30,7 @@ class SohuIE(InfoExtractor):
pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>' pattern = r'<h1 id="video-title">\n*?(.+?)\n*?</h1>'
compiled = re.compile(pattern, re.DOTALL) compiled = re.compile(pattern, re.DOTALL)
title = self._search_regex(compiled, webpage, u'video title').strip('\t\n') title = self._search_regex(compiled, webpage, u'video title').strip('\t\n')
title = self._clearn_html(title) title = clean_html(title)
pattern = re.compile(r'var vid="(\d+)"') pattern = re.compile(r'var vid="(\d+)"')
result = re.search(pattern, webpage) result = re.search(pattern, webpage)
if not result: if not result:
@ -93,5 +83,8 @@ class SohuIE(InfoExtractor):
} }
files_info.append(info) files_info.append(info)
time.sleep(1) time.sleep(1)
if num_of_parts == 1:
info = files_info[0]
info['id'] = video_id
return info
return files_info return files_info