Weird problem matching with REs
Andrew Berg
bahamutzero8825 at gmail.com
Sun May 29 07:45:30 EDT 2011
I have a regular expression that should work (it even works in Kodos [1]),
but in my code it keeps failing to match characters after a newline.
I'm writing a little program that scans the webpage of an arbitrary
application and gets the newest version advertised on the page.
test3.py:
> # -*- coding: utf-8 -*-
>
> import configparser
> import re
> import urllib.request
> import os
> import sys
> import logging
> import collections
>
>
class CouldNotFindVersion(Exception):
    """Raised when the newest version of an application cannot be determined.

    Attributes:
        value: Human-readable message naming the application and the reason.
        cause: Whatever the raiser passed as ``exc_value`` (typically the
            underlying exception).
    """

    def __init__(self, app_name, reason, exc_value):
        """Build the message from *app_name* and *reason* and remember the cause.

        Args:
            app_name: Display name of the application being checked.
            reason: Short text explaining why the lookup failed.
            exc_value: The underlying error, stored unchanged in ``cause``.
        """
        self.value = ('The latest version of ' + app_name
                      + ' could not be determined because ' + reason)
        self.cause = exc_value
        # Populate Exception.args so tracebacks, logging and pickling carry
        # the message instead of an empty tuple.
        super().__init__(self.value)

    def __str__(self):
        """Return the repr of the message (quoted form)."""
        return repr(self.value)
>
class AppUpdateItem():
    """One application entry from the config file plus its newest advertised version.

    Constructing an instance downloads the application's web page, applies the
    section's ``FileURLRegex`` to find the download URL, and applies
    ``VersionRegex`` to that URL to find the version string.
    """

    def __init__(self, config_file_name, config_file_section):
        """Read the section's settings and look up the latest version online.

        Args:
            config_file_name: Path of the INI file the section came from; it
                is rewritten by update_ini() and rollback_ini().
            config_file_section: Mapping-like config section providing the
                keys Name, URL, Filename, FileURLRegex, VersionRegex and
                CurrentVersion.

        Raises:
            CouldNotFindVersion: If the page cannot be loaded or one of the
                regexes finds no match.
        """
        # Stored so the INI-writing methods do not depend on a global name.
        self.config_file_name = config_file_name
        self.section = config_file_section
        self.name = self.section['Name']
        self.url = self.section['URL']
        self.filename = self.section['Filename']
        self.file_re = re.compile(self.section['FileURLRegex'])
        self.ver_re = re.compile(self.section['VersionRegex'])
        self.prev_ver = self.section['CurrentVersion']
        self.success = False
        try:
            # NOTE: the bytes are decoded as-is; line endings are NOT
            # translated, so a page served with CRLF keeps its '\r'
            # characters and patterns may need '\r?\n' rather than '\n'.
            self.page = urllib.request.urlopen(self.url).read().decode('utf-8')
            self.file_URL = self.file_re.findall(self.page)[0]
            self.last_ver = self.ver_re.findall(self.file_URL)[0]
        except urllib.error.URLError as err:
            self.error = str(err)
            logging.info('[%s] Could not load URL: %s : %s',
                         self.name, self.url, self.error)
            # Pass the exception instance (not its type) as the cause.
            raise CouldNotFindVersion(self.name, self.error, err) from err
        except IndexError as err:
            # findall() returned an empty list. Fail loudly instead of
            # leaving file_URL / last_ver undefined on a half-built object.
            logging.warning('Regex did not return a match.')
            self.error = 'the regex did not return a match'
            raise CouldNotFindVersion(self.name, self.error, err) from err
        self.success = True

    def _write_config(self):
        """Write the parser that owns this section back to the config file."""
        with open(self.config_file_name, 'w') as configfile:
            self.section.parser.write(configfile)

    def update_ini(self):
        """Record the newly found version as CurrentVersion and save the INI file."""
        self.section['CurrentVersion'] = self.last_ver
        self._write_config()

    def rollback_ini(self):
        """Restore the CurrentVersion read at construction time and save the INI file."""
        self.section['CurrentVersion'] = self.prev_ver
        self._write_config()

    def download_file(self):
        """Download the matched file URL and save it under the section's Filename."""
        target = self.section['Filename']
        # Fetch first, then open the target: a failed download must not
        # truncate a previously downloaded file.
        payload = urllib.request.urlopen(self.file_URL).read()
        with open(target, 'wb') as file:
            file.write(payload)
>
>
if __name__ == '__main__':
    # Check every application listed in checklist.ini and download those
    # whose advertised version differs from the recorded CurrentVersion.
    config = configparser.ConfigParser()
    config_file = 'checklist.ini'
    config.read(config_file)
    queue = collections.deque()
    for section in config.sections():
        try:
            queue.append(AppUpdateItem(config_file, config[section]))
        except CouldNotFindVersion as exc:
            logging.warning(exc.value)
    for elem in queue:
        # last_ver may be missing if the version lookup did not complete.
        latest = getattr(elem, 'last_ver', None)
        if latest is None or latest == elem.prev_ver:
            continue
        # The INI is updated before the download, so every failure path
        # below must roll it back.
        elem.update_ini()
        try:
            elem.download_file()
        except IOError:
            logging.warning('[' + elem.name + '] Download failed.')
            elem.rollback_ini()
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit, and the error is logged rather than hidden.
            logging.exception('[%s] Unexpected error during download.',
                              elem.name)
            elem.rollback_ini()
        else:
            print(elem.name + ' succeeded.')
checklist.ini:
> [x264_64]
> name = x264 (64-bit)
> filename = x264.exe
> url = http://x264.nl/x264_main.php
> fileurlregex =
> http://x264.nl/x264/64bit/8bit_depth/revision\n{0,3}[0-9]{4}\n{0,3}/x264\n{0,3}.exe
> versionregex = [0-9]{4}
> currentversion = 1995
The part it's supposed to match in http://x264.nl/x264_main.php:
> <a href="http://x264.nl/x264/64bit/8bit_depth/revision
> 1995
> /x264
>
> .exe <view-source-tab:http://x264.nl/x264/64bit/8bit_depth/revision%0A1995%0A/x264%0A%0A.exe>"
I was able to make a regex that matches in my code, but it shouldn't:
http://x264.nl/x264/64bit/8bit_depth/revision.\n{1,3}[0-9]{4}.\n{1,3}/x264.\n{1,3}.\n{1,3}.exe
I have to add a dot before each "\n". As far as I can tell, every character
before those newlines is already accounted for, but I don't get a match
without the dots. I also need both of those ".\n{1,3}" sequences before the
".exe". I'm really confused.
Using Python 3.2 on Windows, in case it matters.
[1] http://kodos.sourceforge.net/ (using the compiled Win32 version
since it doesn't work with Python 3)
More information about the Python-list
mailing list