Description: An XML gleaner for collecting weather data.

Written by: Jim Miller

The site that is queried.

An example of the XML report:

<station id="KRLD" name="Richland AP AWOS" elev="394" lat="46.3056" lon="-119.30419" provider="PDTWFO">
    <ob time="29 Apr 8:50 am PDT" utime="1430322600"><variable var="T" description="Temp" unit="F" value="57"/>
        <variable var="TD" description="Dewp" unit="F" value="32"/><variable var="RH" description="Relh" unit="%" value="38"/>
        <variable var="DD" description="Direction" unit="deg" value="240"/>
        <variable var="DDCARD" description="Wind Card" unit="direction" value="WSW"/>
        <variable var="FF" description="Wind" unit="mph" value="13"/>
        <variable var="FFGUST" description="Gust" unit="mph" value="17"/>
        <variable var="VV" description="Visibility" unit="miles" value="10.00"/>
        <variable var="WEA" description="Weather" unit="" value="Fair"/>
        <variable var="PRESWEA" description="Coded Weather" unit="" value=""/>
        <variable var="SKY" description="Clouds" unit="" value="CLR"/>
        <variable var="ALTSE" description="Altimeter" unit="inches" value="30.19"/>
        <variable var="P" description="Station Pressure" unit="inches" value="29.772"/>
        <variable var="STAQUAL" description="Station Quality" unit="" value="OK"/>

Some more example queries:

Time series, XML output (using their API and a demo token)

Time series, JSON output (using API)

Most recent record (using API)

Time series (using NOAA interface)


# Import supporting modules

# System and OS
import sys, os

# Strings
import string

# HTML fetching
import urllib2

# Windows extensions (pywin32-220.win32-py2.7.exe)
import win32com.client
import pywintypes  # supports exceptions names

# time and date functionality
import time
import datetime

# In support of XML parsing
from xml.dom.minidom import parse 


def timeA_GT_global(timeA):
    # Check to see if A is later than what's in the global. 
    # and replace the global if it is...

    # Must declare locally to change a global. If no global declaration, the
    # ASSIGNMENT statement will force this variable to be local here in this
    # function.
    global time_datetime_maxDLSdate

    if (timeA > time_datetime_maxDLSdate):
        time_datetime_maxDLSdate = timeA

def processMultipleStations_xml(station_dictionary):
    # Build the URL string to run a query for one station.
    # (returned on one page).
    # http://www.wrh.noaa.gov/mesowest/getobextXml.php?num=1&sid=KRLD
    url_base = "http://www.wrh.noaa.gov/mesowest/getobextXml.php?num=2&sid="

    station_count = 0
    write_count = 0
    # This index keeps track of successful writes that happen when attempting
    # to write observations other then the first one.
    write_count_notFirst = 0
    for station_name in station_dictionary.keys():        
        print ""
        print "station name =", station_name
        station_count += 1
        webpage_url = url_base + station_name.split(".")[0]

            # Fetch the XML page
            request = urllib2.Request(webpage_url, headers={'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})
            page = urllib2.urlopen(request)
            message_str = "Error opening url for %s ::: %s ==> %s" % (station_name, sys.exc_type, sys.exc_value) 
            print message_str + ", URL = " + webpage_url
            # Skip this site, but continue trying the others.

        # Parse the XML
        dom_object = parse(page) 
        # Get all the elements with the observation "ob" tag.
        ob_elements = dom_object.getElementsByTagName("ob")
        # Use the first three observations or less if not available.
        if ob_elements.length < 3:
            n_elements_to_use = ob_elements.length
            n_elements_to_use = 3
        # Attempt to write each of the first few observations to the database.
        for ob_index in list(range( n_elements_to_use)):
            print "ob_index =", ob_index
            # Initialize the weather dictionary: time and sensor data.
            weather_dic = {'station_number':'Null',
            weather_dic['station_name'] = "'%s'" % station_name
            weather_dic['station_number'] = "'%s'" % station_dictionary[station_name]['ID']
            # Get the local sensor time from the "time" attribute.
            time_value = ob_elements[ob_index].getAttribute("time")
            #print "time =", time_value
            # Get the epoch time.
            time_epoch = int(ob_elements[ob_index].getAttribute("utime"))
            #print "time_epoch= ", time_epoch 
            # Interpret this epoch time in the timezone of this computer (the gleaner).
            # Apparently this is a DLS form of the time.
            time_datetime_gleaner = datetime.datetime.fromtimestamp(time_epoch) 
            #print "time_datetime_gleaner = ", time_datetime_gleaner
            # Check if this gleaner time is daylight or standard. If daylight, convert it
            # back to standard time for the database.
            # This check for DaylightSavings time will use the gleaner time (T=1, F=0)
            dlsTF = time.localtime(float(time_epoch))[-1]
            if dlsTF:
                time_datetime_gleaner_std = time_datetime_gleaner - datetime.timedelta(hours=1)
                time_datetime_gleaner_std = time_datetime_gleaner
            print "time_datetime_gleaner_std =", time_datetime_gleaner_std
            # Shift this gleaner time to the native time zone (the local time zone where the sensor is).
            TZ_shift_net = station_dictionary[station_name]['TZShift_MN'] - TZShift_gleanermoved
            time_datetime_native_std = time_datetime_gleaner_std + datetime.timedelta(hours=TZ_shift_net)
            #print "time_datetime_native =", time_datetime_native
            weather_dic['epoch_xml'] = "'%d'" % time_epoch
            weather_dic['time_xml'] = "'%s'" % time_value
            # Make a time tuple so and extract the pieces of this standard sensor time (sst).
            sst = time_datetime_native_std.timetuple()
            weather_dic['time_native_std'] = "'%d/%d/%d %d:%d:%d'" % (sst[1], sst[2], sst[0], sst[3], sst[4], 0)
            weather_dic['MDY'] = "'%d/%d/%d'" % (sst[1], sst[2], sst[0])
            weather_dic['Hr'] = "'%d'" % sst[3]
            weather_dic['Min'] = "'%d'" % sst[4]
            # Get all the "variable" tags from this observation node.
            variables = ob_elements[ob_index].getElementsByTagName("variable")
            for variable in variables:
                # print variable.getAttribute("var"), "=", variable.getAttribute("value")
                # Put these values into quotes for the SQL. If no data, use the unquoted Null (the default).
                weather_dic[variable.getAttribute("var")] = "'%s'" % variable.getAttribute("value")
                # If no wind, set the wind speed and gust value to zero, and the direction to Null. This keeps
                # the direction values, for these points, from contributing to the running average. Note that
                # MesoWest sets the direction value to 0 in these cases (not good).
                if (variable.getAttribute("var") == 'DDCARD') and (variable.getAttribute("value") == 'CALM'):
                    # DD always appears before DDCARD in the variables object, so this works 
                    # to do this in this for loop.
                    weather_dic['DD'] = 'Null'
                    # FF and FFGUST don't appear in the variable object (at all) if DDCARD is CALM. 
                    # So these won't be reset later in the loop.
                    weather_dic['FF'] = "'0'"
                    weather_dic['FFGUST'] = "'0'"
            if (weather_dic['FFGUST'] == 'Null'):
                # If no gust data, just make it equal to the regular wind speed.
                weather_dic['FFGUST'] = weather_dic['FF']
            # It appears that some negative wind directions are coming from MesoWest, so
            # null them if that is the case.
            #print "DD =", weather_dic['DD']
            if (weather_dic['DD'] <> 'Null'):
                # Strip of the quotes and then check for negative numbers.
                if (float(weather_dic['DD'][1:-1]) < 0):
                    weather_dic['DD'] = 'Null'
            #print "weather_dic =", weather_dic
            #print "T in dic =", weather_dic['T']
            #print "WEA in dic =", weather_dic['WEA']      

                # Develop SQL string
                sqlForStation = build_SQL_string(weather_dic) 

                # Run the SQL (send formal SQL string and info list). If a
                # successful write is indicated, bump up the counter.
                if (runSQL( sqlForStation[0], sqlForStation[1])):
                    write_count += 1
                    if (ob_index > 0):
                        write_count_notFirst += 1
                # Note the backslash is a line-continuation character.
                message_str = "general error in Station loop ::: %s ==> %s, \nStation name = %s" \
                              % (sys.exc_type, sys.exc_value, 
                                 getXMLvalueInStationGroup(dom_object, data_index, "station_id")) 
                print message_str
                print "SQL construction failed."
    # If there was a successful write to database, make entry in log showing
    # the number of successful gleans from the web site and the number of
    # successful writes to the database.
    # W = Writes
    # P = Possible (number of stations collecting from)
    message_str = "Data write record: %s P, %s W, %s W_NF" % (station_count, write_count, write_count_notFirst)
    print message_str
    if (write_count > 0):
        enterInLog( message_str) 

    return ''

def build_SQL_string(wd):

    # Input parameter is the weather dictionary (wd)

    sql_names = "INSERT INTO FifteenMinData (" +\
                    "PerlTime, DateTimeStamp, LiteralDateTimeStamp, TimeMDY, TimeHr, TimeMin, " +\
                    "StationNumber, StationName, WindDirection, WindSpeed, WindGust, " +\
                    "TempAvg, DewPoint, Pressure) "
    sql_values = "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" % (
                    wd['epoch_xml'], wd['time_native_std'], wd['time_xml'], wd['MDY'], wd['Hr'], wd['Min'], 
                    wd['station_number'], wd['station_name'], wd['DD'], wd['FF'], wd['FFGUST'], 
                    wd['T'], wd['TD'], wd['ALTSE'])

    sql_string = sql_names + sql_values
    print "SQL string = ", sql_string

    # Return SQL string and data list (to be used with error messages)
    return (sql_string, sql_values)

def runSQL(sql_string, row_ascii):

    # row_ascii: Contains the variable names and is useful in constructing error messages.

    successful_execution = False 

    # Execute SQL string
        successful_execution = True 
        # For testing
        #print "SQL = ", sql_string
        #print "row_ascii = ", row_ascii

    except pywintypes.com_error:   # catch this specific error
        # If only a "duplicate data" error, don't write to the log file
        if (string.find(str(sys.exc_value), "duplicate data") <> -1):
            print "Data already in database (sql warning)."
            message_str = "SQL error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii) 
            print message_str

        message_str = "general error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii) 
        print message_str

    else:  # Run this block if no errors...   
        print "Successful SQL execution!"

    # # For testing, clean out any new records for this date.
    # sql = "DELETE * FROM [FifteenMinData] WHERE ([TimeMDY] = #6/29/2006#)"
    # database_conn.Execute(sql)

def enterInLog( logentry):
    logFile.write('==============' + str(datetime.datetime.today()) + '====V ' + version_number + '\n')
    logFile.write( logentry + '\n')

def openConnections():

    # Open logFile
    global logFile

    # The getcwd os function returns the current working directory. Another
    # similar command is os.path.abspath(""). This returns "C:\\waconia" on
    # Physics2 and "C:\\code\\python\\scraper" at here at home. Note that if
    # this is running as a system AT job, this will return something like
    # "C:\\winnt\\system32"

    logFileDir = os.getcwd()
    # If it's not at home then it must be running on Physics2 
    if (logFileDir <> "C:\\code\\python\\scraper"):
        logFileDir = "C:\\Users\\Jim\\Documents\\webcontent\\waconia"
    logFileName = "ws_richland_log.txt"
    logFilePath = logFileDir + "\\" + logFileName 

    # Check for the file and create a new one if none is found.
    # Note: os.curdir doesn't work as a way to find the current directory when
    # if this task is scheduled. So I use the absolute paths above.
    if logFileName not in os.listdir(logFileDir):   
            logFile = open(logFilePath, 'w')  # create and write 
            sys.exit("Error when opening log file. Script stopped!!!")  # Shutdown if no way to log...

        # Header in file.
        logFile.write('==============VERSION '+ version_number +'========================' + '\n')
        logFile.write('File created: ' + str(datetime.datetime.today()) + '\n')
        logFile.write('=================================================' + '\n')
        logFile.write('=================================================' + '\n')
        logFile.write('=================================================' + '\n')
        logFile = open(logFilePath, 'a')  # append

    # Open Database connection
    global database_conn
    database_conn = win32com.client.Dispatch(r'ADODB.Connection')

    # Set path to the database depending on where this is running. 
    if (logFileDir == "C:\\code\\python\\scraper"):
        DSN = 'PROVIDER=Microsoft.Jet.OLEDB.4.0;DATA SOURCE=C:/code/python/scraper/telem.mdb;'  # At home 
        DSN = 'PROVIDER=Microsoft.Jet.OLEDB.4.0;DATA SOURCE=C:/Users/Jim/Documents/webcontent/waconia/data/telem.mdb;'  # On NUC

        enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))
        # Close logFile on the way out.
        sys.exit("Error when opening database connection. Script stopped!!!")

def closeConnections():
    database_conn.Close()  # Notice the CAPITAL C
    logFile.close()        # Notice the lowercase c

def attemptWriteToDaysGleaned():

    # This serves to keep the DaysGleaned table up to date. If this gleaner is
    # the first to cross into a new day (which it should since it looks at
    # central time zone sources), this should produce a successful write.

    # Make a local tuple out of the global time_datetime_maxDLSdate
    dT = time_datetime_maxDLSdate.timetuple()

    sql_string = "INSERT INTO DaysGleaned ([TimeMDY]) VALUES ('%s/%s/%s')" % (dT[1],dT[2],dT[0]) 
        # If not a "duplicate data" error, write to the log file
        if (string.find(str(sys.exc_value),"duplicate data") == -1):  # can't find it in string
            enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))

Main program

# global variables (any variable in Main that get assigned)

# This global is used for triggering a new day in the database's daysgleaned table.
time_datetime_maxDLSdate = datetime.datetime(2001,1,1) # initialize to some old date

# This also gets written to the log file.
version_number = "1.1"
print "Version Number = ", version_number

# Prepare to write to database and log file.

# This global is used to shift the time-zone shifts if the gleaner is ever
# moved from MN.
TZShift_gleanermoved = 0

# The airport site name/number/time-zone shift.
# Dictionary of dictionaries data structure used identify all the stations
# to be gleaned and associated parameters. Add more stations here if you like...

station_dic = {
   # WA and OR
   'KRLD':{'ID':'318','TZShift_MN':-2},   # Richland, WA
   'HOXO':{'ID':'354','TZShift_MN':-2},   # Hood River, OR
   'KOTH':{'ID':'356','TZShift_MN':-2},   # North Bend, OR
   'KMWH':{'ID':'357','TZShift_MN':-2},   # Moses Lake, WA
   # Fritz's sites on the cape...
   'KHSE.2':{'ID':'358','TZShift_MN':1},  # Cape Hatteras, NC
   'KCQX.2':{'ID':'359','TZShift_MN':1},  # Chatham, MA
   # Joe and Hawaii
   'PHOG':{'ID':'351','TZShift_MN':-4},   # Maui Airport, HI
   'PHJR':{'ID':'352','TZShift_MN':-4},   # Oahu, Kalaeloa Airport, HI
   'PHBK':{'ID':'353','TZShift_MN':-4},   # Kauai, Barking Sands Airport, HI
   # MN
   'KMKT.2':{'ID':'355','TZShift_MN':0},  # Mankato, MN
   # Columbia River (for delta-p chart)
   'KDLS':{'ID':'166','TZShift_MN':-2},   # Dalles, WA 
   'KTTD':{'ID':'167','TZShift_MN':-2},   # Troutdale, OR 
   'KHRI':{'ID':'168','TZShift_MN':-2}    # Hermiston, OR 

# Fetch one XML page per station. Parse and write to database once for each station. 

# Close connections to database and log file.