"""
Description: An XML gleaner for collecting weather data.
Written by: Jim Miller
The site that is queried.
http://www.wrh.noaa.gov/mesowest/getobextXml.php?sid=KRLD&num=1
An example of the XML report:
<station id="KRLD" name="Richland AP AWOS" elev="394" lat="46.3056" lon="-119.30419" provider="PDTWFO">
<ob time="29 Apr 8:50 am PDT" utime="1430322600"><variable var="T" description="Temp" unit="F" value="57"/>
<variable var="TD" description="Dewp" unit="F" value="32"/><variable var="RH" description="Relh" unit="%" value="38"/>
<variable var="DD" description="Direction" unit="deg" value="240"/>
<variable var="DDCARD" description="Wind Card" unit="direction" value="WSW"/>
<variable var="FF" description="Wind" unit="mph" value="13"/>
<variable var="FFGUST" description="Gust" unit="mph" value="17"/>
<variable var="VV" description="Visibility" unit="miles" value="10.00"/>
<variable var="WEA" description="Weather" unit="" value="Fair"/>
<variable var="PRESWEA" description="Coded Weather" unit="" value=""/>
<variable var="SKY" description="Clouds" unit="" value="CLR"/>
<variable var="ALTSE" description="Altimeter" unit="inches" value="30.19"/>
<variable var="P" description="Station Pressure" unit="inches" value="29.772"/>
<variable var="STAQUAL" description="Station Quality" unit="" value="OK"/>
</ob>
</station>
Some more example queries:
Time series, XML output (using their API and a demo token)
http://api.mesowest.net/v2/stations/timeseries?stid=KMKT&start=201607040000&end=201607050000&obtimezone=local&output=xml&token=1234567890
Time series, JSON output (using API)
http://api.mesowest.net/v2/stations/timeseries?stid=KMKT&start=201607040000&end=201607050000&obtimezone=local&output=json&token=1234567890
Most recent record (using API)
http://api.mesowest.net/v2/stations/nearesttime?stid=KMKT&within=120&obtimezone=local&output=xml&token=1234567890
Time series (using NOAA interface)
http://www.wrh.noaa.gov/mesowest/getobextXml.php?sid=KMKT&num=2
"""
# Import supporting modules
# System and OS
import sys, os
# Strings
import string
# HTML fetching
import urllib2
# Windows extensions (pywin32-220.win32-py2.7.exe)
import win32com.client
import pywintypes # supports exceptions names
# time and date functionality
import time
import datetime
# In support of XML parsing
from xml.dom.minidom import parse
"""
Functions
"""
def timeA_GT_global(timeA):
# Check to see if A is later than what's in the global.
# and replace the global if it is...
# Must declare locally to change a global. If no global declaration, the
# ASSIGNMENT statement will force this variable to be local here in this
# function.
global time_datetime_maxDLSdate
if (timeA > time_datetime_maxDLSdate):
time_datetime_maxDLSdate = timeA
def processMultipleStations_xml(station_dictionary):
# Build the URL string to run a query for one station.
# (returned on one page).
# http://www.wrh.noaa.gov/mesowest/getobextXml.php?num=1&sid=KRLD
url_base = "http://www.wrh.noaa.gov/mesowest/getobextXml.php?num=2&sid="
station_count = 0
write_count = 0
# This index keeps track of successful writes that happen when attempting
# to write observations other then the first one.
write_count_notFirst = 0
for station_name in station_dictionary.keys():
print ""
print "station name =", station_name
station_count += 1
webpage_url = url_base + station_name.split(".")[0]
try:
# Fetch the XML page
request = urllib2.Request(webpage_url, headers={'User-Agent': 'Mozilla/5.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'})
page = urllib2.urlopen(request)
except:
message_str = "Error opening url for %s ::: %s ==> %s" % (station_name, sys.exc_type, sys.exc_value)
enterInLog( message_str)
print message_str + ", URL = " + webpage_url
# Skip this site, but continue trying the others.
continue
try:
# Parse the XML
dom_object = parse(page)
except:
message_str = "Error parsing XML for %s ::: %s ==> %s" % (station_name, sys.exc_type, sys.exc_value)
enterInLog( message_str)
print message_str + ", URL = " + webpage_url
# Skip this site, but continue trying the others.
continue
# Get all the elements with the observation "ob" tag.
ob_elements = dom_object.getElementsByTagName("ob")
# Use the first three observations or less if not available.
if ob_elements.length < 3:
n_elements_to_use = ob_elements.length
else:
n_elements_to_use = 3
# Attempt to write each of the first few observations to the database.
for ob_index in list(range( n_elements_to_use)):
print "ob_index =", ob_index
# Initialize the weather dictionary: time and sensor data.
weather_dic = {'station_number':'Null',
'station_name':'Null',
'epoch_xml':'Null',
'time_xml':'Null',
'time_native_std':'Null',
'MDY':'Null',
'Hr':'Null',
'Min':'Null',
'T':'Null',
'TD':'Null',
'DD':'Null',
'DDCARD':'Null',
'FF':'Null',
'FFGUST':'Null',
'ALTSE':'Null',
'P':'Null'}
weather_dic['station_name'] = "'%s'" % station_name
weather_dic['station_number'] = "'%s'" % station_dictionary[station_name]['ID']
# Get the local sensor time from the "time" attribute.
time_value = ob_elements[ob_index].getAttribute("time")
#print "time =", time_value
# Get the epoch time.
time_epoch = int(ob_elements[ob_index].getAttribute("utime"))
#print "time_epoch= ", time_epoch
# Interpret this epoch time in the timezone of this computer (the gleaner).
# Apparently this is a DLS form of the time.
time_datetime_gleaner = datetime.datetime.fromtimestamp(time_epoch)
#print "time_datetime_gleaner = ", time_datetime_gleaner
# Check if this gleaner time is daylight or standard. If daylight, convert it
# back to standard time for the database.
# This check for DaylightSavings time will use the gleaner time (T=1, F=0)
dlsTF = time.localtime(float(time_epoch))[-1]
if dlsTF:
time_datetime_gleaner_std = time_datetime_gleaner - datetime.timedelta(hours=1)
else:
time_datetime_gleaner_std = time_datetime_gleaner
print "time_datetime_gleaner_std =", time_datetime_gleaner_std
# Shift this gleaner time to the native time zone (the local time zone where the sensor is).
TZ_shift_net = station_dictionary[station_name]['TZShift_MN'] - TZShift_gleanermoved
time_datetime_native_std = time_datetime_gleaner_std + datetime.timedelta(hours=TZ_shift_net)
#print "time_datetime_native =", time_datetime_native
weather_dic['epoch_xml'] = "'%d'" % time_epoch
weather_dic['time_xml'] = "'%s'" % time_value
# Make a time tuple so and extract the pieces of this standard sensor time (sst).
sst = time_datetime_native_std.timetuple()
weather_dic['time_native_std'] = "'%d/%d/%d %d:%d:%d'" % (sst[1], sst[2], sst[0], sst[3], sst[4], 0)
weather_dic['MDY'] = "'%d/%d/%d'" % (sst[1], sst[2], sst[0])
weather_dic['Hr'] = "'%d'" % sst[3]
weather_dic['Min'] = "'%d'" % sst[4]
# Get all the "variable" tags from this observation node.
variables = ob_elements[ob_index].getElementsByTagName("variable")
for variable in variables:
# print variable.getAttribute("var"), "=", variable.getAttribute("value")
# Put these values into quotes for the SQL. If no data, use the unquoted Null (the default).
weather_dic[variable.getAttribute("var")] = "'%s'" % variable.getAttribute("value")
# If no wind, set the wind speed and gust value to zero, and the direction to Null. This keeps
# the direction values, for these points, from contributing to the running average. Note that
# MesoWest sets the direction value to 0 in these cases (not good).
if (variable.getAttribute("var") == 'DDCARD') and (variable.getAttribute("value") == 'CALM'):
# DD always appears before DDCARD in the variables object, so this works
# to do this in this for loop.
weather_dic['DD'] = 'Null'
# FF and FFGUST don't appear in the variable object (at all) if DDCARD is CALM.
# So these won't be reset later in the loop.
weather_dic['FF'] = "'0'"
weather_dic['FFGUST'] = "'0'"
if (weather_dic['FFGUST'] == 'Null'):
# If no gust data, just make it equal to the regular wind speed.
weather_dic['FFGUST'] = weather_dic['FF']
# It appears that some negative wind directions are coming from MesoWest, so
# null them if that is the case.
#print "DD =", weather_dic['DD']
if (weather_dic['DD'] <> 'Null'):
# Strip of the quotes and then check for negative numbers.
if (float(weather_dic['DD'][1:-1]) < 0):
weather_dic['DD'] = 'Null'
#print "weather_dic =", weather_dic
#print "T in dic =", weather_dic['T']
#print "WEA in dic =", weather_dic['WEA']
try:
# Develop SQL string
sqlForStation = build_SQL_string(weather_dic)
# Run the SQL (send formal SQL string and info list). If a
# successful write is indicated, bump up the counter.
if (runSQL( sqlForStation[0], sqlForStation[1])):
write_count += 1
if (ob_index > 0):
write_count_notFirst += 1
except:
# Note the backslash is a line-continuation character.
message_str = "general error in Station loop ::: %s ==> %s, \nStation name = %s" \
% (sys.exc_type, sys.exc_value,
getXMLvalueInStationGroup(dom_object, data_index, "station_id"))
enterInLog(message_str)
print message_str
print "SQL construction failed."
# If there was a successful write to database, make entry in log showing
# the number of successful gleans from the web site and the number of
# successful writes to the database.
# W = Writes
# P = Possible (number of stations collecting from)
message_str = "Data write record: %s P, %s W, %s W_NF" % (station_count, write_count, write_count_notFirst)
print message_str
if (write_count > 0):
enterInLog( message_str)
attemptWriteToDaysGleaned()
return ''
def build_SQL_string(wd):
# Input parameter is the weather dictionary (wd)
sql_names = "INSERT INTO FifteenMinData (" +\
"PerlTime, DateTimeStamp, LiteralDateTimeStamp, TimeMDY, TimeHr, TimeMin, " +\
"StationNumber, StationName, WindDirection, WindSpeed, WindGust, " +\
"TempAvg, DewPoint, Pressure) "
sql_values = "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" % (
wd['epoch_xml'], wd['time_native_std'], wd['time_xml'], wd['MDY'], wd['Hr'], wd['Min'],
wd['station_number'], wd['station_name'], wd['DD'], wd['FF'], wd['FFGUST'],
wd['T'], wd['TD'], wd['ALTSE'])
sql_string = sql_names + sql_values
print "SQL string = ", sql_string
# Return SQL string and data list (to be used with error messages)
return (sql_string, sql_values)
def runSQL(sql_string, row_ascii):
# row_ascii: Contains the variable names and is useful in constructing error messages.
successful_execution = False
# Execute SQL string
try:
database_conn.Execute(sql_string)
successful_execution = True
# For testing
#print "SQL = ", sql_string
#print "row_ascii = ", row_ascii
except pywintypes.com_error: # catch this specific error
# If only a "duplicate data" error, don't write to the log file
if (string.find(str(sys.exc_value), "duplicate data") <> -1):
print "Data already in database (sql warning)."
else:
message_str = "SQL error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii)
enterInLog(message_str)
print message_str
return(successful_execution)
except:
message_str = "general error ::: %s ==> %s, \nData value = %s" % (sys.exc_type, sys.exc_value, row_ascii)
enterInLog(message_str)
print message_str
return(successful_execution)
else: # Run this block if no errors...
print "Successful SQL execution!"
return(successful_execution)
# # For testing, clean out any new records for this date.
# sql = "DELETE * FROM [FifteenMinData] WHERE ([TimeMDY] = #6/29/2006#)"
# database_conn.Execute(sql)
def enterInLog( logentry):
logFile.write('==============' + str(datetime.datetime.today()) + '====V ' + version_number + '\n')
logFile.write( logentry + '\n')
def openConnections():
# Open logFile
global logFile
# The getcwd os function returns the current working directory. Another
# similar command is os.path.abspath(""). This returns "C:\\waconia" on
# Physics2 and "C:\\code\\python\\scraper" at here at home. Note that if
# this is running as a system AT job, this will return something like
# "C:\\winnt\\system32"
logFileDir = os.getcwd()
# If it's not at home then it must be running on Physics2
if (logFileDir <> "C:\\code\\python\\scraper"):
logFileDir = "C:\\Users\\Jim\\Documents\\webcontent\\waconia"
logFileName = "ws_richland_log.txt"
logFilePath = logFileDir + "\\" + logFileName
# Check for the file and create a new one if none is found.
# Note: os.curdir doesn't work as a way to find the current directory when
# if this task is scheduled. So I use the absolute paths above.
if logFileName not in os.listdir(logFileDir):
try:
logFile = open(logFilePath, 'w') # create and write
except:
sys.exit("Error when opening log file. Script stopped!!!") # Shutdown if no way to log...
# Header in file.
logFile.write('==============VERSION '+ version_number +'========================' + '\n')
logFile.write('File created: ' + str(datetime.datetime.today()) + '\n')
logFile.write('=================================================' + '\n')
logFile.write('=================================================' + '\n')
logFile.write('=================================================' + '\n')
else:
logFile = open(logFilePath, 'a') # append
# Open Database connection
global database_conn
database_conn = win32com.client.Dispatch(r'ADODB.Connection')
# Set path to the database.
DSN = 'PROVIDER=Microsoft.Jet.OLEDB.4.0;DATA SOURCE=C:/Users/Jim/Documents/webcontent/waconia/data/telem.mdb;'
try:
database_conn.Open(DSN)
except:
enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))
# Close logFile on the way out.
logFile.close()
sys.exit("Error when opening database connection. Script stopped!!!")
def closeConnections():
database_conn.Close() # Notice the CAPITAL C
logFile.close() # Notice the lowercase c
def attemptWriteToDaysGleaned():
# This serves to keep the DaysGleaned table up to date. If this gleaner is
# the first to cross into a new day (which it should since it looks at
# central time zone sources), this should produce a successful write.
# Make a local tuple out of the global time_datetime_maxDLSdate
dT = time_datetime_maxDLSdate.timetuple()
sql_string = "INSERT INTO DaysGleaned ([TimeMDY]) VALUES ('%s/%s/%s')" % (dT[1],dT[2],dT[0])
try:
database_conn.Execute(sql_string)
except:
# If not a "duplicate data" error, write to the log file
if (string.find(str(sys.exc_value),"duplicate data") == -1): # can't find it in string
enterInLog("Error ::: %s ==> %s" % (sys.exc_type, sys.exc_value))
"""
Main program
"""
# global variables (any variable in Main that get assigned)
# This global is used for triggering a new day in the database's daysgleaned table.
time_datetime_maxDLSdate = datetime.datetime(2001,1,1) # initialize to some old date
# This also gets written to the log file.
version_number = "1.1"
print "Version Number = ", version_number
# Prepare to write to database and log file.
openConnections()
# This global is used to shift the time-zone shifts if the gleaner is ever
# moved from MN.
TZShift_gleanermoved = 0
# The airport site name/number/time-zone shift.
# Dictionary of dictionaries data structure used identify all the stations
# to be gleaned and associated parameters. Add more stations here if you like...
station_dic = {
# WA and OR
'KRLD':{'ID':'318','TZShift_MN':-2}, # Richland, WA
'HOXO':{'ID':'354','TZShift_MN':-2}, # Hood River, OR
'KOTH':{'ID':'356','TZShift_MN':-2}, # North Bend, OR
'KMWH':{'ID':'357','TZShift_MN':-2}, # Moses Lake, WA
# Fritz's sites on the cape...
'KHSE.2':{'ID':'358','TZShift_MN':1}, # Cape Hatteras, NC
'KCQX.2':{'ID':'359','TZShift_MN':1}, # Chatham, MA
# Joe and Hawaii
'PHOG':{'ID':'351','TZShift_MN':-4}, # Maui Airport, HI
'PHJR':{'ID':'352','TZShift_MN':-4}, # Oahu, Kalaeloa Airport, HI
'PHBK':{'ID':'353','TZShift_MN':-4}, # Kauai, Barking Sands Airport, HI
# MN
'KMKT.2':{'ID':'355','TZShift_MN':0}, # Mankato, MN
# Columbia River (for delta-p chart)
'KDLS':{'ID':'166','TZShift_MN':-2}, # Dalles, WA
'KTTD':{'ID':'167','TZShift_MN':-2}, # Troutdale, OR
'KHRI':{'ID':'168','TZShift_MN':-2} # Hermiston, OR
}
# Fetch one XML page per station. Parse and write to database once for each station.
processMultipleStations_xml(station_dic)
# Close connections to database and log file.
closeConnections()