Source code for analyze

"""A module containing various methods for analyzing the cloudiness of images.

This module is designed to be used in conjunction with downloaded images to
perform statistical analyses and plotting. Methods are provided for
analyzing months of data and saving the cloudiness values to a file. It is
then possible to take this data and plot it in various forms, across a variety
of different plots. The cloudiness can be plotted as a histogram, and two
different models can be fit to the histograms. These two models are a double
Gaussian and a Gaussian-Poisson hybrid.
"""
import os
import time
import ast
from scipy import optimize
from scipy import special
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ephem

import io_util
import moon
import histogram
import coordinates
import image


[docs]def get_latest_analyzed(): """Find the latest analyzed date. Returns ------- str The latest date analyzed in yyyymmdd format. Notes ----- The latest date analyzed is found by searching through Data/ for the latest month that has been analyzed, and then searching through that month for the latest day that has been analyzed. """ # Gets the downloaded months directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"]) months = sorted(os.listdir(directory)) # If no dates have been analyzed yet return 0 if not months or months[-1] == ".DS_Store": return 0 # Gets the singular days that have been analyzed directory = os.path.join(directory, months[-1]) days = sorted(os.listdir(directory)) # If no days for this month have been analyzed yet return the first day. if not days: start = months[-1] + "01" return start # Return the final day analyzed. We will reanalyze this completely, # in case the analysis crashed mid day. # Second slice slices off the .txt start = days[-1][:-4] return start
[docs]def analyze(): """Determine cloudiness of images for every day from January 1, 2017 onward. See Also -------- histogram.cloudiness : Calculate the cloudiness of a histogram. Notes ----- For each night, all images taken that day are downloaded. For every night analyzed, the cloudiness values for every image taken during that night are written to "Data/yyyymm/yyyymmdd.txt" where yyyymmdd corresponds to the date analyzed. """ t1 = time.perf_counter() link = "http://kpasca-archives.tuc.noao.edu/" rlink = io_util.download_url(link) if rlink is None: print("Getting dates failed.") return html = rlink.text # This parser was designed to work with image names but it works the same # for dates so why make a new one? parser = io_util.DateHTMLParser() parser.feed(html) parser.close() datelinks = parser.data parser.clear_data() # 20080306 is the first date the camera is located correctly # 20080430 is the first date the images are printed correctly startdate = 20100101 # Reads in the model coefficients. cloud_loc = os.path.join(os.path.dirname(__file__), *["data", "clouds.txt"]) with open(cloud_loc, "r") as f: for line in f: line = line.rstrip() line = line.split(",") b = float(line[0]) c = float(line[1]) fails = [] for date in datelinks: # Strips the /index from the date so we have just the date d = date[:8] # The month of the date for organization purposes. month = d[:6] # This makes it so we start with the date we left off with. if int(d) < startdate: continue if not 20140726 <= int(d) <= 20141231: continue # Print a new line to differentiate dates. print() print(d) rdate = io_util.download_url(link + date) # If we fail to get the data for the date, log it and move to the next. if rdate is None: fails.append(date) print("Failed to retrieve data for " + date) continue # Extracting the image names. htmldate = rdate.text parser.feed(htmldate) parser.close() imagenames = parser.data parser.clear_data() monthloc = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month]) if not os.path.exists(monthloc): os.makedirs(monthloc) datafile = os.path.join(monthloc, d + ".txt") with open(datafile, "w") as f: for name in imagenames: # This ignores all the blue filter images, since we don"t want # to process those. if name[:1] == "b": continue # We want to ignore the all image animations if name == "allblue.gif" or name == "allred.gif": continue # This creates an AllSkyImage we use for subsequent methods. # This is the path the image is saved to. # We need to check if it exists first, just in case. path = os.path.join(os.path.dirname(__file__), *["Images", "Original", "KPNO", d, name]) # Download the image, if it hasn"t been downloaded if not os.path.isfile(path): io_util.download_image(d, name) img = image.load_image(name, d, "KPNO") # Finds the moon and the sun in the image. # We only process images where the moon is visble (alt > 0) # And the sun is low enough to not wash out the image # (sun alt < -17) moonalt = moon.find_moon(img)[2] sunalt = moon.find_sun(img)[0] # Checks that the analysis conditions are met. if moonalt > 0 and sunalt < -17: # Then we make a histogram and a "cloudiness fraction" # Generates the moon mask. mask = moon.moon_mask(img) hist = histogram.generate_histogram(img, mask) frac = histogram.cloudiness(hist) # Correction for moon phase. phase = moon.moon_phase(img) val = b * phase * phase + c * phase frac = frac / val # Then we save the cloudiness fraction to the file for that # date. dataline = name + "," + str(frac) + "\n" f.write(dataline) print("Analyzed: " + img.formatdate) # Deletes an image if we didn"t analyze it. else: os.remove(path) t2 = time.perf_counter() print(t2 - t1) print("The following dates failed to download: " + str(fails))
[docs]def month_plot(): """Generate a plot of nightly cloudiness for each month. Notes ----- The cloudiness values for every image saved using :func:`~analyze` are loaded. These values are plotted, with all images for a given month on a single plot. Plots are saved in "Images/Plots/" with the filename "scatter-`month`.png". """ # Gets the downloaded months directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"]) # If the data directory doesn"t exist we should exit here. if not os.path.exists(directory): print("No data found.") return months = sorted(os.listdir(directory)) # Macs are dumb if ".DS_Store" in months: months.remove(".DS_Store") for month in months: # Gets the days that were analyzed for that month directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month]) days = sorted(os.listdir(directory)) data = [] illum = [] x = [] # Reads the data for each day. for day in days: d1 = directory + day f1 = open(d1, "r") # Strips off the .txt so we can make a Time object. day = day[:-4] for line in f1: line = line.rstrip() line = line.split(",") # Gets the plot date for the timestring object. d = coordinates.timestring_to_obj(day, line[0]).plot_date data.append(float(line[1])) x.append(d) vis = moon.moon_visible(day, line[0]) illum.append(vis) f1.close() # Sets up the plot fig, ax = plt.subplots() fig.set_size_inches(20, 5) ax.plot_date(x, data, xdate=True, markersize=1) ax.plot_date(x, illum, xdate=True, markersize=1, color="red") # These are the x divisions which indicate the start of each day. xt = [] start = int(days[0][:-4]) end = int(days[-1][:-4]) + 1 for i in range(start, end): # Finds the plot_date for the start of the day. x1 = coordinates.timestring_to_obj(str(i), "r_ut000000s00000").plot_date xt.append(x1) # Need the right end to be the start of next month. # Adding 100 increments the month number. # First if statement checks if this is december in which case we need # to roll over the year and reset the month. if str(start)[4:6] == "12": end = str(start + 8900) else: end = str(start + 100) d2 = coordinates.timestring_to_obj(end, "r_ut000000s00000").plot_date # Sets the limits and division ticks. ax.set_ylim(0, 1.0) ax.set_xlim(xt[0], d2) ax.set_xticks(xt) ax.xaxis.grid(True) ax.xaxis.set_ticklabels([]) # Sets the axis labels and saves. ax.set_ylabel("Cloudiness Fraction") # Puts the date in a nice form for the plot axis. date = month[-2:] + "/01/" + month[:-2] ax.set_xlabel("Time After " + date) plotloc = os.path.join(os.path.dirname(__file__), *["Images", "Plots"]) if not os.path.exists(plotloc): os.makedirs(plotloc) plt.savefig(plotloc + "scatter-" + month + ".png", dpi=256, bbox_inches="tight") plt.close()
# Sets up a pyephem object for the camera. camera = ephem.Observer() camera.lat = "31.959417" camera.lon = "-111.598583" camera.elevation = 2120 camera.horizon = "-17"
[docs]def plot(years=["2015","2016","2017"], fit_histograms=False): """Generate various cloudiness plots and save them to Images/Plots. Parameters ---------- years : list, optional List of years to include in the plots. Defaults to ["2015","2016","2017"]. fit_histograms : bool, optional To fit the weekly histograms and plot the resulting fit variables or not. Defaults to False. Notes ----- The cloudiness values for every image saved using :func:`~analyze` are loaded. These values are further analyzed to create new plots. The generated plots include plots of Cloudiness vs Moon Phase, Cloudiness vs Hours since sunset, Cloudiness vs Normalized time after sunset, Cloudiness vs Hours before sunrise, and Cloudiness vs Week Number. These plots are saved to Images/Plots. If `fit_histograms` is true, the weekly histograms will be fit by a double Gaussian or Gaussian-Poisson hybrid function, after which plots of week number versus the various fit parameters will be made and saved as well. """ # Gets the downloaded months directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"]) # If the data directory doesn"t exist we should exit here. if not os.path.exists(directory): print("No data found.") return # Temporary arrays for moon phase plots phasenum = 50 phasediv = 1 / phasenum # Phase ranges from 0-1 so divisions is 1/num divs tphase = [[] for i in range(0, phasenum)] # Temporary arrays for sunset plots sunsetnum = 50 sunsetdiv = 0.5 / sunsetnum # Up to 12 hours after sunset = 0.5 day / divs tsunset = [[] for i in range(0, sunsetnum)] # Temporary arrays for sunrise plots sunrisenum = 50 sunrisediv = 0.5 / sunrisenum # 12 hours before sunrise = 0.5 day / divs tsunrise = [[] for i in range(0, sunrisenum)] # Temp other stuff normalnum = 50 normaldiv = 1. / normalnum tnormal = [[] for i in range(0, normalnum)] # Temporary arrays for week plots tweek = [[] for i in range(0, 53)] tmoon = [[] for i in range(0, 53)] months = sorted(os.listdir(directory)) # Macs are dumb if ".DS_Store" in months: months.remove(".DS_Store") for month in months: # Gets the days that were analyzed for that month directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month]) days = sorted(os.listdir(directory)) # Strips out the year from the month year = month[:4] if not year in years: continue # Day 1 of the year, for week calculation. yearstart = year + "0101" day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000") # Reads the data for each day. for day in days: loc = os.path.join(directory, day) f1 = open(loc, "r") # Strips off the .txt so we can make a Time object. day = day[:-4] for line in f1: # Splits out the value and file. line = line.rstrip() line = line.split(",") val = float(line[1]) name = line[0] # A phantom image for calculation that doesn"t actually have # any data in it. img = image.AllSkyImage(name, day, "KPNO", None) # Moon phase calculation. phase = moon.moon_phase(img) # Ignores cloudiness with moon phase less than 0.2 if phase < 0.2: continue b = int(phase // phasediv) tphase[b].append(val) formatdate = img.formatdate # Sets the date of calculation. camera.date = formatdate date = ephem.Date(formatdate) # Calculates the previous setting and next rising of the sun. sun = ephem.Sun() setting = camera.previous_setting(sun, use_center=True) rising = camera.next_rising(sun, use_center=True) # Finds the difference and bins it into the correct bin. diff = date - setting length = rising - setting b = int(diff // sunsetdiv) b2 = int((diff / length) // normaldiv) tsunset[b].append(val) tnormal[b2].append(val) # Finds the difference and bins it into the correct bin. diff = rising - date b = int(diff // sunrisediv) tsunrise[b].append(val) # Week of the year calculation. # Gets the date object for the image date = img.time # Finds the difference since the beginning of the year to find # The week number. diff = date - day1 week = int(diff.value // 7) tweek[week].append(val) tmoon[week].append(phase) # Prints the progress if this is the final month of the year and we # just finished it. if month[4:] == "12": print("Collated " + year + " data") percents = ["25", "50", "75"] colors = [(1, 0, 0, 1), (0, 0, 1, 1), (1, 1, 0, 1)] # Nan array for if there"s no data for that bin. nanarray = np.asarray([[float("nan"), float("nan"), float("nan")]]) # This method sets up the plots. using the given x array and the data set # it finds the percentile values for each bin division. def setup_plot(x, dataset): # We only need this so the axis shape works out. We delete it later. data = np.asarray([[0, 0, 0]]) for i, val in enumerate(dataset): temp = np.asarray(val) # Percentile returns an array of each of the three values. # We"re creating a data array where each column is all the # percentile values for each bin value. if temp.size == 0: data = np.append(data, nanarray, axis=0) else: d = np.reshape(np.percentile(temp, [25, 50, 75]), (1, 3)) data = np.append(data, d, axis=0) # Deletes the first 0,0,0 array. data = np.delete(data, 0, 0) plt.ylim(0, 4.0) # This sets up the individual plots. You can just pass the data # multiararay but I need each line to be individually labeled. for i, val in enumerate(percents): plt.plot(x, data[0:data.shape[0], i], label=val + "%", color=colors[i]) # This fills between the lowest and top percentiles. plt.fill_between(x, data[0:data.shape[0], 0], data[0:data.shape[0], -1], color=(1, 0.5, 0, 0.5)) plt.legend() # Plotting and averaging code # Moon phase plt.ylabel("Cloudiness Relative to Mean") plt.xlabel("Moon Phase") x = np.asarray((range(0, phasenum))) x = x * phasediv + (phasediv / 2.) setup_plot(x, tphase) plt.savefig("Images/Plots/phase.png", dpi=256, bbox_inches="tight") plt.close() # Sunset plt.ylabel("Cloudiness Relative to Mean") plt.xlabel("Hours since sunset") x = np.asarray((range(0, sunsetnum))) x = x * sunsetdiv * 24 + (sunsetdiv * 12) setup_plot(x, tsunset) plt.savefig("Images/Plots/sunset.png", dpi=256, bbox_inches="tight") plt.close() # Normalized plt.ylabel("Cloudiness Relative to Mean") plt.xlabel("Normalized time after sunset") x = np.asarray((range(0, normalnum))) x = x * normaldiv + (normaldiv / 2.) setup_plot(x, tnormal) plt.savefig("Images/Plots/normalized-0.png", dpi=256, bbox_inches="tight") plt.close() # Sunrise x = np.asarray((range(0, sunrisenum))) x = x * sunrisediv * 24 + (sunrisediv * 12) # Sets up the plot before we plot the things plt.ylabel("Cloudiness Relative to Mean") plt.xlabel("Hours before sunrise") setup_plot(x, tsunrise) plt.savefig("Images/Plots/sunrise.png", dpi=256, bbox_inches="tight") plt.close() # Week x = np.asarray((range(1, 54))) #plt.ylabel("Cloudiness Relative to Mean") plt.xlabel("Week Number") plt.ylabel("Cloudiness Relative to Mean") # Moon phase averages. moon_avgs = [] num_imgs = [] for i, val in enumerate(tweek): moon_avgs.append(np.mean(tmoon[i])) num_imgs.append(len(val)) setup_plot(x, tweek) #Reads in the csv file using pandas. domedata = pd.read_csv("daily-2007-2017.csv") closedict = dict.fromkeys(years) for y in years: name = "Y" + y closedict[y] = domedata.get(name).values tclose = [[] for i in range(0, 53)] closeav = [] # i represents the day number of the year in 2016. If it is larger than 59 # (31 + 28) we need to increase the day by 1 to account for the leap day. # This is because the domedata csv does not include February 29. # For example, March 1st is considered day 60, when in 2016 it # is actually day 61. for j, val in enumerate(closedict["2016"]): if j >= 60: i = j + 1 else: i = j week1 = j // 7 week2 = i // 7 for key, val in closedict.items(): # We need to append to a different week if it's a leap year, due # to the leap day changing the day number of the year after # February. week = week2 if int(key) % 4 == 0 else week1 tclose[week].append(val[j]) for i, val in enumerate(tclose): closeav.append(np.mean(val)) # Alternative additional data. #plt.plot(x, moons, label="Moon Phase", color=(0, 1, 0, 1)) #num_imgs = np.asarray(num_imgs) #num_imgs = num_imgs * 1 / (np.amax(num_imgs)) #plt.plot(x, num_imgs, label="Normalized number of images", #color=(0, 1, 0, 1)) plt.plot(x, closeav, label="Average Closed Fraction", color=(0,1,0,1)) plt.legend() plt.savefig("Images/Plots/week.png", dpi=256, bbox_inches="tight") plt.close() if fit_histograms: data = np.asarray([[0, 0, 0, 0, 0]]) split = 10 w = 0.61 / split num = np.amax(tweek[2]) / w divs = np.asarray(range(0, int(num) + 1)) divs = divs * w for i, val in enumerate(tweek): temp = np.asarray(val) # Percentile returns an array of each of the three values. # We"re creating a data array where each column is all the # percentile values for each bin value. if temp.size == 0: data = np.append(data, nanarray, axis=0) else: # Passes the data to the fitting method. temp = np.delete(temp, np.where(temp < 0.061)) d, _ = find_fit(temp, divs) # Inserts a the sqaure root of the lambda for the mu-2 if the fit # Is a gaussian-poisson combo. if len(d) < 5: d = np.asarray(d) d = np.insert(d, 3, np.sqrt(d[2])) d = d.reshape(1,5) data = np.append(data, d, axis=0) # Deletes the first 0,0,0 array. data = np.delete(data, 0, 0) # Code to plot the fits, abstracted because it was getting a little # cumbersome to copy paste. def fit_plot(x, data, name): loc = "Images/Plots/week-" + name.lower() + ".png" plt.plot(x, data, label=name) plt.scatter(x, data, label=name, s=2, c="r") plt.xlabel("Week Number") plt.ylabel("Cloudiness Relative to Mean") plt.legend() plt.savefig(loc, dpi=256, bbox_inches="tight") plt.close() # This code plots the variables individually sigma1 = data[0:data.shape[0], 0] fit_plot(x, sigma1, "Sigma-1") mu1 = data[0:data.shape[0], 1] fit_plot(x, mu1, "Mu-1") cv1 = sigma1 / mu1 fit_plot(x, cv1, "CV-1") sigma2 = data[0:data.shape[0], 2] fit_plot(x, sigma2, "Sigma-2") mu2 = data[0:data.shape[0], 3] print(np.argmin(mu2)) mu2 = np.where(mu2 < -100, mu1, mu2) fit_plot(x, mu2, "Mu-2") cv2 = sigma2 / mu2 fit_plot(x, cv2, "CV-2") frac = data[0:data.shape[0], 4] frac = (np.tanh(frac) + 1) / 2 fit_plot(x, frac, "Frac") plt.plot(x, mu1 + mu2, label="Plus") plt.scatter(x, mu1 + mu2, label="Plus", s=2, c="r") plt.plot(x, np.abs(mu1 - mu2), label="Minus") plt.scatter(x, np.abs(mu1 - mu2), label="Minus", s=2, c="g") plt.xlabel("Week Number") plt.legend() plt.savefig("Images/Plots/week-mu-test.png", dpi=256, bbox_inches="tight") plt.close() plt.plot(x, sigma1 + sigma2, label="Plus") plt.scatter(x, sigma1 + sigma2, label="Plus", s=2, c="r") plt.plot(x, np.abs(sigma1 - sigma2), label="Minus") plt.scatter(x, np.abs(sigma1 - sigma2), label="Minus", s=2, c="g") plt.xlabel("Week Number") plt.legend() plt.savefig("Images/Plots/week-sigma-test.png", dpi=256, bbox_inches="tight") plt.close() plt.plot(x, cv1 + cv2, label="Plus") plt.scatter(x, cv1 + cv2, label="Plus", s=2, c="r") plt.plot(x, np.abs(cv1 - cv2), label="Minus") plt.scatter(x, np.abs(cv1 - cv2), label="Minus", s=2, c="g") plt.xlabel("Week Number") plt.legend() plt.savefig("Images/Plots/week-cv-test.png", dpi=256, bbox_inches="tight") plt.close()
[docs]def model(): """Model the dependence of cloudiness on the moon phase. Notes ----- Cloudiness is plotted on the y-axis against the moon phase on the x-axis, and then np.linalg is used to find a quadratic model that fits this data. This model is used to remove the dependence of moon phase on the cloudiness data. """ loc = os.path.join(os.path.dirname(__file__), *["data", "phase.txt"]) data = [] # Reads in the phase and cloudiness information. # Literal eval makes it so that it reads the list as a list rather than # a string. with open(loc, "r") as f: for line in f: line = line.rstrip() b = ast.literal_eval(line) data.append(b) x = data[0] x.pop(0) x = np.asarray(x) # Just need to convert to floats including the nan. for i in range(1, len(data)): data[i] = [float(j) for j in data[i]] plt.ylim(0, 1.0) plt.ylabel("Average Cloudiness Fraction") plt.xlabel("Moon Phase") coeffs1 = [] coeffs2 = [] for i in range(1, len(data)): data[i].pop(0) # This line is to avoid hard coding in case I add more years of data. year = 2017 - len(data) + 1 + i # We do the fitting manually using the least squares algorithm. # This forces the fit to go through 0,0 since we do not pass a constant # coefficient, only the x^2 and x terms. A = np.vstack([x*x, x]).T b, c = np.linalg.lstsq(A, data[i])[0] coeffs1.append(b) coeffs2.append(c) plt.plot(x, data[i], label="Mean-" + str(year)) plt.plot(x, b*x*x + c*x, label="Fit-" + str(year)) # An average of the year wise fits. m = np.mean(coeffs1) n = np.mean(coeffs2) plt.plot(x, m*x*x + n*x, label="Mean Fit") # Writes the coefficients to a file for later use. cloud_loc = os.path.join(os.path.dirname(__file__), *["data", "clouds.txt"]) with open(cloud_loc, "w") as f: f.write(str(m) + "," + str(n)) plt.legend() plt.savefig("Images/temp.png", dpi=256, bbox_inches="tight") plt.close()
[docs]def histo(): """Generate weekly histograms of cloudiness values for 2016 and 2017. Notes ----- The cloudiness values for every image saved using :func:`~analyze` are loaded and collated into histograms containing one week each. Each week produces three histograms: one with only 2016 images, one with only 2017 images, and one with both on the same histogram. The 159 histograms are saved to "Images/Plots/Weeks". """ directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"]) months = sorted(os.listdir(directory)) # Macs are dumb if ".DS_Store" in months: months.remove(".DS_Store") tweek = {} tweek["all"] = [[] for i in range(0, 53)] for month in months: # Gets the days that were analyzed for that month directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month]) days = sorted(os.listdir(directory)) # Strips out the year from the month year = month[:4] # Initializes a year if it doesn"t exist. if not year in tweek: tweek[year] = [[] for i in range(0, 53)] # Day 1 of the year, for week calculation. yearstart = year + "0101" day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000") # Reads the data for each day. for day in days: loc = directory + day f1 = open(loc, "r") # Strips off the .txt so we can make a Time object. day = day[:-4] for line in f1: # Splits out the value and file. line = line.rstrip() line = line.split(",") val = float(line[1]) name = line[0] # Moon phase calculation. phase = moon.moon_visible(day, name) # Ignores cloudiness with moon phase less than 0.2 if phase < 0.2: continue date = coordinates.timestring_to_obj(day, name) # Finds the difference since the beginning of the year to find # The week number. diff = date - day1 week = int(diff.value // 7) tweek["all"][week].append(val) tweek[year][week].append(val) split = 10 w = 0.61 / split # Starts by finding the divs because we want the width to be the same. num = np.amax(tweek["all"][2]) / w divs = np.asarray(range(0, int(num) + 1)) divs = divs * w # Blocks out the non major tick labels. Only keeps the major ones # Pre bar split (hence the modulo of the split) labels = list(divs) for j, label in enumerate(divs): if j % split != 0: labels[j] = "" else: labels[j] = round(label, 3) binstop = -16 * split histstop = binstop + 1 saveloc = "Images/Plots/Weeks" if not os.path.exists(saveloc): os.makedirs(saveloc) # Don"t want to recreate this every time. x = np.arange(0, divs[binstop], 0.05) chosen = {1:0, 2:0, 3:0, 4:0} sames = [] # Loops over each week (the i value) for i, val in enumerate(tweek["all"]): for year, value in tweek.items(): temp = np.asarray(tweek[year][i]) temp = np.delete(temp, np.where(temp < 0.061)) hist, bins = np.histogram(temp, bins=divs) # Sets the size wider than the previous to fit all the bins. # I shave off a lot of 0 value bins later as well in plotting. fig = plt.figure() fig.set_size_inches(11.4, 8.4) print("Week " + str(i+1)) plt.title("Week " + str(i+1)) plt.ylim(0, 900 / (split / 2)) plt.ylabel("Number of Occurrences") plt.xlabel("Cloudiness Relative to Mean") # Number of images used to make this histogram. num = len(temp) # Plots everything, histogram, and then the fitted data on top. if year == "all": # Changes the numbers for each year temp2 = np.asarray(tweek["2016"][i]) temp2 = np.delete(temp2, np.where(temp2 < 0.061)) num2 = len(temp2) num = num - num2 # For all, we plot everything, then 2016 on top for separation. plot = plt.bar(bins[:binstop], hist[:histstop], width=w, align="edge", tick_label=labels[:binstop], label="2017 (" + str(num) + ")") hist2, bins2 = np.histogram(temp2, bins=divs) plot = plt.bar(bins2[:binstop], hist2[:histstop], width=w, align="edge", tick_label=labels[:binstop], label="2016 (" + str(num2) + ")") else: # Plots just the year histogram. plot = plt.bar(bins[:binstop], hist[:histstop], width=w, align="edge", tick_label=labels[:binstop], label=year + " (" + str(num) + ")") coeffs, best = find_fit(temp, divs) print("Fit " + str(best + 1) + " Chosen") # Increments the dictionary counter chosen[best+1] = chosen[best+1] + 1 print(str(year) + ": " + str(coeffs)) # Finds the y func, then scales so it has the same area as the hist if len(coeffs) == 4: y = function_gp(coeffs, x) else: y = function_gg(coeffs, x) #print("Area: " + str(np.trapz(y,x))) scale = np.sum(hist) * w / np.trapz(y,x) y = y * scale # Plots the fit. plt.plot(x, y, color=(0, 1, 0, 1), label="Fit-" + str(best + 1)) plt.legend() plt.savefig("Images/Plots/Weeks/hist-" + str(i + 1) + "-" + year + ".png", dpi=256, bbox_inches="tight") plt.close() print("Saved: Week " + str(i+1)) print() print(chosen) nums = 0 for i, val in enumerate(sames): if val: nums = nums + 1 print("Trues: " + str(nums)) print("Falses: " + str(len(sames) - nums))
# Divs is the divisions to use, used in finding the guess parameters
[docs]def find_fit(data, divs): """Find the best fit to a weekly histogram. Parameters ---------- data : array_like The cloudiness data used in generating a histogram. divs : array_like The lower bounds on the divisions used in generating a histogram. Returns ------- tuple Tuple, where the first item is a list of coefficients corresponding to the best fit, and the second item is an identifying number that corresponds to which fit (a double gaussian or a Gaussian and Poisson hybrid) is the best. 0 and 1 correspond to a double Gaussian, and 2 and 3 correspond to a Gaussian and Poisson hybrid. """ split = 10 w = 0.61 / split # Assign a new array so we don"t edit the original temp = np.copy(data) hist, _ = np.histogram(temp, bins=divs) # The guess for the first mean is the maximum that occurs before the cutoff # at index 35 (about 2.44) # And the guess for the second mean is the maximum that occurs after. index = np.argmax(hist[:32]) guess = (index + 0.5) * w guess2 = (np.argmax(hist[32:]) + 32.5) * w # This block of code finds the first time the histogram falls below # Half the max, which gives us the half width at half maximum. # (Aprroximately). Since the curve is not smooth this actually # Isn"t great. Typically the histogram will drop below the half # Max before jumping up above it again for a few bins. less = np.where(hist < (hist[index] / 2), True, False) hwhm = (np.argmax(less[index:])) * w guesssigma = hwhm / (np.sqrt(2 * np.log(2))) # Guesses the frac paramter by comparing the heights of the two maximums. maximum = np.amax(hist) guessfrac = maximum/(maximum + 1 * np.amax(hist[35:])) guessfrac = np.arctanh(2 * guessfrac - 1) # Sets guessfrac to 0 if it breaks for some reason. if np.isnan(guessfrac) or np.isinf(guessfrac): guessfrac = 0 fitarr = [] coeffsarr = [] fit1 = fit_function(temp) fitarr.append(fit1.fun) coeffsarr.append(fit1.x) fit1 = fit_function(temp, init=[0.1, guess, 0.5, guess2, guessfrac]) fitarr.append(fit1.fun) coeffsarr.append(fit1.x) fit1 = fit_function(temp, func="gp") fitarr.append(fit1.fun) coeffsarr.append(fit1.x) fit1 = fit_function(temp, func="gp", init=[0.1, guess, guess2, guessfrac]) fitarr.append(fit1.fun) coeffsarr.append(fit1.x) fitarr = np.abs(np.asarray(fitarr)) best = np.argmin(fitarr) orig = best coeffs = np.copy(coeffsarr) while np.abs(coeffsarr[best][0]) < 0.002: print("Delta Found") fitarr = np.delete(fitarr, best) coeffsarr.pop(best) if fitarr.size == 0: best = np.argmin(fitarr) else: best = orig coeffsarr = np.copy(coeffs) coeffsarr[best][0] = 0.05 break return (coeffsarr[best], orig)
# Inverted the args for this, so they match those used by scipy"s minmize. # Minimize changes the coefficients making those the variables here.
[docs]def function_gp(params, x): """A Gaussian and Poisson hybrid function. Parameters ---------- params : array_like A list containing all of the defining parameters for the function. The first item is the standard deviation of the Gaussian, the second is the mean of the Gaussian, the third is the mean and variance of the Poisson, and the fourth is a fraction that defines how biased the function should be towards the Gaussian. For example, if the provided fraction is 1 then the function is only a Gaussian, whereas if the provided fraction is 0 the function is only the a Poisson. x : array_like The x coordinates to plug into the function. Returns ------- numpy.ndarray The function values corresponding to the input x values. Notes ----- The function is normalized so that the area under the curve from 0 to infinity is equal to 1. """ sigma = params[0] mu = params[1] lamb = params[2] frac = params[3] # A is the normalization constant, here changed because 0 is a hard cutoff. # So we normalize 0 to infinity rather than -infinity to infinity. # Trapz is the integration method using the trapezoid rule. x1 = np.arange(0, 400, 0.1) A = 1 / np.trapz(np.exp(-((x1 - mu) ** 2) / (2 * sigma * sigma)), x1) # This constrains frac to be between 0 and 1 using the hyperbolic tangent frac = (np.tanh(frac) + 1) / 2 p1 = frac * A * np.exp(-((x - mu) ** 2) / (2 * sigma * sigma)) p2 = (1 - frac) * (lamb ** x / special.factorial(x)) * np.exp(-lamb) return p1 + p2
[docs]def function_gg(params, x): """A double Gaussian hybrid function. Parameters ---------- params : array_like A list containing all of the defining parameters for the function. The first item is the standard deviation of the first (lower) Gaussian, the second is the mean of the first Gaussian, the third is the standard deviation of the second (higher) Gaussian, the fourth is the mean of the second Gaussian, and the fifth is a fraction that defines how biased the function should be towards the first Gaussian. For example, if the provided fraction is 1 then the function is only the first Gaussian, whereas if the provided fraction is 0 the function is only the second Gaussian. x : array_like The x coordinates to plug into the function. Returns ------- numpy.ndarray The function values corresponding to the input x values. Notes ----- The function is normalized so that the area under the curve from 0 to infinity is equal to 1. """ sigma1 = params[0] mu1 = params[1] sigma2 = params[2] mu2 = params[3] frac = params[4] # A is the normalization constant, here changed because 0 is a hard cutoff. # So we normalize 0 to infinity rather than -infinity to infinity. # Trapz is the integration method using the trapezoid rule. x1 = np.arange(0, 400, 0.1) A = 1 / np.trapz(np.exp(-((x1 - mu1) ** 2) / (2 * sigma1 * sigma1)), x1) B = 1 / np.trapz(np.exp(-((x1 - mu2) ** 2) / (2 * sigma2 * sigma2)), x1) # This constrains frac to be between 0 and 1 using the hyperbolic tangent frac = (np.tanh(frac) + 1) / 2 p1 = frac * A * np.exp(-((x - mu1) ** 2) / (2 * sigma1 * sigma1)) p2 = (1 - frac) * B * np.exp(-((x - mu2) ** 2) / (2 * sigma2 * sigma2)) return p1 + p2
def likelihood_gg(params, data): """The log likelihood value corresponding to the double Gaussian hybrid function. Parameters ---------- params : array_like A list containing all of the defining parameters for the function. The first item is the standard deviation of the first (lower) Gaussian, the second is the mean of the first Gaussian, the third is the standard deviation of the second (higher) Gaussian, the fourth is the mean of the second Gaussian, and the fifth is a fraction that defines how biased the function should be towards the first Gaussian. For example, if frac = 1 then the function is only the first Gaussian, whereas if frac = 0 the function is only the second Gaussian. data : array_like The x coordinates to plug into the function. Returns ------- float The log likelihood value. Notes ----- The log likelihood of a function is calulated using a chi-squared analysis, and thus is the negative sum of the natural logarithm of the function values. """ # In chi-squared there"s a 2 in front but since we"re minimizing I"ve # dropped it. chi = -np.sum(np.log(function_gg(params, data))) return chi def likelihood_gp(params, data): """The log likelihood value corresponding to the Gaussian and Poisson hybrid function. params : array_like A list containing all of the defining parameters for the function. The first item is the standard deviation of the Gaussian, the second is the mean of the Gaussian, the third is the mean and standard deviation of the Poisson, and the fourth is a fraction that defines how biased the function should be towards the Gaussian. For example, if frac = 1 then the function is only a Gaussian, whereas if frac = 0 the function is only a Poisson. data : array_like The x coordinates to plug into the function. Returns ------- float The log likelihood value. Notes ----- The log likelihood of a function is calulated using a chi-squared analysis, and thus is the negative sum of the natural logarithm of the function values. """ # In chi-squared there"s a 2 in front but since we"re minimizing I"ve # dropped it. chi = -np.sum(np.log(function_gp(params, data))) return chi # Fits the model to the data # I abstracted this in case I need it somewhere else.
[docs]def fit_function(xdata, func="gg", init=None): """Fit a function to a histogram. Parameters ---------- xdata : array_like The cloudiness data used to compute a histogram. func : str, optional Which function to fit. "gg" corresponds to a double Gaussian hybrid, and "gp" corresponds to a Gaussian and Poisson hybrid function. Defaults to "gg". init : list, optional A list of initial guesses for the function fitting. Defaults to None. If None, uses an initial guess of [0.1,0.5,0.5,3,0] for a double Gaussian hybrid function, and [0.1,0.5,3,0] for a Gaussian and Poisson hybrid function. Returns ------- scipy.optimize.OptimizeResult An object corresponding to the fit function. """ if func == "gg": likelihood = likelihood_gg if init is None: init = [0.1,0.5,0.5,3,0] else: likelihood = likelihood_gp if init is None: init = [0.1,0.5,3,0] # Default for maxiter is N * 200 but that"s not enough in this case so we # need to specify a higher value fit = optimize.minimize(likelihood, x0=init, args=xdata, method="Nelder-Mead", options={"disp":False, "maxiter":1200}) return fit
# Saves the cloudiness data as a csv file.
[docs]def to_csv(): """Convert the saved daily cloudiness data to a CSV file. Notes ----- The cloudiness values for every image saved using :func:`~analyze` are loaded and converted to a single CSV file: data.csv. The CSV file has 5 headers: Date & Time, Year, Week Number, Normalized Time after Sunset and Cloudiness Relative to the Mean. """ directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"]) months = sorted(os.listdir(directory)) # Macs are dumb if ".DS_Store" in months: months.remove(".DS_Store") # Temp other stuff normalnum = 50 normaldiv = 1. / normalnum data = [] for month in months: # Gets the days that were analyzed for that month directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month]) days = sorted(os.listdir(directory)) # Strips out the year from the month year = month[:4] # Day 1 of the year, for week calculation. yearstart = year + "0101" day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000") # Reads the data for each day. for day in days: loc = os.path.join(directory, day) f1 = open(loc, "r") # Strips off the .txt so we can make a Time object. day = day[:-4] for line in f1: linedata = [] # Splits out the value and file. line = line.rstrip() line = line.split(",") val = float(line[1]) name = line[0] # Moon phase calculation. phase = moon.moon_visible(day, name) # Ignores cloudiness with moon phase less than 0.2 if phase < 0.2: continue date = coordinates.timestring_to_obj(day, name) # Finds the difference since the beginning of the year to find # The week number. diff = date - day1 week = int(diff.value // 7) # Sunset time calculation. # 12 hours after sunset for 50 bins = 0.01 of a day per bin. formatdate = day[:4] + "/" + day[4:6] + "/" + day[6:] time = name[4:6] + ":" + name[6:8] + ":" + name[8:10] formatdate = formatdate + " " + time linedata.append(formatdate) linedata.append(year) linedata.append(week) # Sets the date of calculation. camera.date = formatdate date = ephem.Date(formatdate) # Calculates the previous setting and next rising of the sun. sun = ephem.Sun() setting = camera.previous_setting(sun, use_center=True) rising = camera.next_rising(sun, use_center=True) # Finds the difference and bins it into the correct bin. diff = date - setting length = rising - setting normalize = int((diff / length) // normaldiv) linedata.append(normalize) linedata.append(val) data.append(np.asarray(linedata)) data = np.asarray(data) d2 = pd.DataFrame(data, columns=["Date & Time", "Year", "Week Number", "Normalized Time after Sunset", "Cloudiness Relative to the Mean"]) d2.to_csv("data.csv")
if __name__ == "__main__": #analyze() plot(["2014", "2015", "2016", "2017"])