"""A module containing various methods for analyzing the cloudiness of images.
This module is designed to be used in conjunction with downloaded images to
perform statistical analyses and plotting. Methods are provided for
analyzing months of data and saving the cloudiness values to a file. It is
then possible to take this data and plot it in various forms, across a variety
of different plots. The cloudiness can be plotted as a histogram, and two
different models can be fit to the histograms. These two models are a double
Gaussian and a Gaussian-Poisson hybrid.
"""
import os
import time
import ast
from scipy import optimize
from scipy import special
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ephem
import io_util
import moon
import histogram
import coordinates
import image
[docs]def get_latest_analyzed():
"""Find the latest analyzed date.
Returns
-------
str
The latest date analyzed in yyyymmdd format.
Notes
-----
The latest date analyzed is found by searching through Data/ for the
latest month that has been analyzed, and then searching through that month
for the latest day that has been analyzed.
"""
# Gets the downloaded months
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"])
months = sorted(os.listdir(directory))
# If no dates have been analyzed yet return 0
if not months or months[-1] == ".DS_Store":
return 0
# Gets the singular days that have been analyzed
directory = os.path.join(directory, months[-1])
days = sorted(os.listdir(directory))
# If no days for this month have been analyzed yet return the first day.
if not days:
start = months[-1] + "01"
return start
# Return the final day analyzed. We will reanalyze this completely,
# in case the analysis crashed mid day.
# Second slice slices off the .txt
start = days[-1][:-4]
return start
[docs]def analyze():
"""Determine cloudiness of images for every day from January 1, 2017 onward.
See Also
--------
histogram.cloudiness : Calculate the cloudiness of a histogram.
Notes
-----
For each night, all images taken that day are downloaded. For every night
analyzed, the cloudiness values for every image taken during that night
are written to "Data/yyyymm/yyyymmdd.txt" where yyyymmdd corresponds to the
date analyzed.
"""
t1 = time.perf_counter()
link = "http://kpasca-archives.tuc.noao.edu/"
rlink = io_util.download_url(link)
if rlink is None:
print("Getting dates failed.")
return
html = rlink.text
# This parser was designed to work with image names but it works the same
# for dates so why make a new one?
parser = io_util.DateHTMLParser()
parser.feed(html)
parser.close()
datelinks = parser.data
parser.clear_data()
# 20080306 is the first date the camera is located correctly
# 20080430 is the first date the images are printed correctly
startdate = 20100101
# Reads in the model coefficients.
cloud_loc = os.path.join(os.path.dirname(__file__), *["data", "clouds.txt"])
with open(cloud_loc, "r") as f:
for line in f:
line = line.rstrip()
line = line.split(",")
b = float(line[0])
c = float(line[1])
fails = []
for date in datelinks:
# Strips the /index from the date so we have just the date
d = date[:8]
# The month of the date for organization purposes.
month = d[:6]
# This makes it so we start with the date we left off with.
if int(d) < startdate:
continue
if not 20140726 <= int(d) <= 20141231:
continue
# Print a new line to differentiate dates.
print()
print(d)
rdate = io_util.download_url(link + date)
# If we fail to get the data for the date, log it and move to the next.
if rdate is None:
fails.append(date)
print("Failed to retrieve data for " + date)
continue
# Extracting the image names.
htmldate = rdate.text
parser.feed(htmldate)
parser.close()
imagenames = parser.data
parser.clear_data()
monthloc = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month])
if not os.path.exists(monthloc):
os.makedirs(monthloc)
datafile = os.path.join(monthloc, d + ".txt")
with open(datafile, "w") as f:
for name in imagenames:
# This ignores all the blue filter images, since we don"t want
# to process those.
if name[:1] == "b":
continue
# We want to ignore the all image animations
if name == "allblue.gif" or name == "allred.gif":
continue
# This creates an AllSkyImage we use for subsequent methods.
# This is the path the image is saved to.
# We need to check if it exists first, just in case.
path = os.path.join(os.path.dirname(__file__), *["Images", "Original", "KPNO", d, name])
# Download the image, if it hasn"t been downloaded
if not os.path.isfile(path):
io_util.download_image(d, name)
img = image.load_image(name, d, "KPNO")
# Finds the moon and the sun in the image.
# We only process images where the moon is visble (alt > 0)
# And the sun is low enough to not wash out the image
# (sun alt < -17)
moonalt = moon.find_moon(img)[2]
sunalt = moon.find_sun(img)[0]
# Checks that the analysis conditions are met.
if moonalt > 0 and sunalt < -17:
# Then we make a histogram and a "cloudiness fraction"
# Generates the moon mask.
mask = moon.moon_mask(img)
hist = histogram.generate_histogram(img, mask)
frac = histogram.cloudiness(hist)
# Correction for moon phase.
phase = moon.moon_phase(img)
val = b * phase * phase + c * phase
frac = frac / val
# Then we save the cloudiness fraction to the file for that
# date.
dataline = name + "," + str(frac) + "\n"
f.write(dataline)
print("Analyzed: " + img.formatdate)
# Deletes an image if we didn"t analyze it.
else:
os.remove(path)
t2 = time.perf_counter()
print(t2 - t1)
print("The following dates failed to download: " + str(fails))
[docs]def month_plot():
"""Generate a plot of nightly cloudiness for each month.
Notes
-----
The cloudiness values for every image saved using :func:`~analyze` are loaded. These
values are plotted, with all images for a given month on a single plot.
Plots are saved in "Images/Plots/" with the filename "scatter-`month`.png".
"""
# Gets the downloaded months
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"])
# If the data directory doesn"t exist we should exit here.
if not os.path.exists(directory):
print("No data found.")
return
months = sorted(os.listdir(directory))
# Macs are dumb
if ".DS_Store" in months:
months.remove(".DS_Store")
for month in months:
# Gets the days that were analyzed for that month
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month])
days = sorted(os.listdir(directory))
data = []
illum = []
x = []
# Reads the data for each day.
for day in days:
d1 = directory + day
f1 = open(d1, "r")
# Strips off the .txt so we can make a Time object.
day = day[:-4]
for line in f1:
line = line.rstrip()
line = line.split(",")
# Gets the plot date for the timestring object.
d = coordinates.timestring_to_obj(day, line[0]).plot_date
data.append(float(line[1]))
x.append(d)
vis = moon.moon_visible(day, line[0])
illum.append(vis)
f1.close()
# Sets up the plot
fig, ax = plt.subplots()
fig.set_size_inches(20, 5)
ax.plot_date(x, data, xdate=True, markersize=1)
ax.plot_date(x, illum, xdate=True, markersize=1, color="red")
# These are the x divisions which indicate the start of each day.
xt = []
start = int(days[0][:-4])
end = int(days[-1][:-4]) + 1
for i in range(start, end):
# Finds the plot_date for the start of the day.
x1 = coordinates.timestring_to_obj(str(i),
"r_ut000000s00000").plot_date
xt.append(x1)
# Need the right end to be the start of next month.
# Adding 100 increments the month number.
# First if statement checks if this is december in which case we need
# to roll over the year and reset the month.
if str(start)[4:6] == "12":
end = str(start + 8900)
else:
end = str(start + 100)
d2 = coordinates.timestring_to_obj(end, "r_ut000000s00000").plot_date
# Sets the limits and division ticks.
ax.set_ylim(0, 1.0)
ax.set_xlim(xt[0], d2)
ax.set_xticks(xt)
ax.xaxis.grid(True)
ax.xaxis.set_ticklabels([])
# Sets the axis labels and saves.
ax.set_ylabel("Cloudiness Fraction")
# Puts the date in a nice form for the plot axis.
date = month[-2:] + "/01/" + month[:-2]
ax.set_xlabel("Time After " + date)
plotloc = os.path.join(os.path.dirname(__file__), *["Images", "Plots"])
if not os.path.exists(plotloc):
os.makedirs(plotloc)
plt.savefig(plotloc + "scatter-" + month + ".png",
dpi=256, bbox_inches="tight")
plt.close()
# Sets up a pyephem object for the camera.
camera = ephem.Observer()
camera.lat = "31.959417"
camera.lon = "-111.598583"
camera.elevation = 2120
camera.horizon = "-17"
[docs]def plot(years=["2015","2016","2017"], fit_histograms=False):
"""Generate various cloudiness plots and save them to Images/Plots.
Parameters
----------
years : list, optional
List of years to include in the plots. Defaults to
["2015","2016","2017"].
fit_histograms : bool, optional
To fit the weekly histograms and plot the resulting fit variables
or not. Defaults to False.
Notes
-----
The cloudiness values for every image saved using :func:`~analyze` are loaded.
These values are further analyzed to create new plots. The generated
plots include plots of Cloudiness vs Moon Phase, Cloudiness vs Hours since
sunset, Cloudiness vs Normalized time after sunset, Cloudiness vs Hours
before sunrise, and Cloudiness vs Week Number. These plots are saved to
Images/Plots. If `fit_histograms` is true, the weekly histograms will be
fit by a double Gaussian or Gaussian-Poisson hybrid function, after which
plots of week number versus the various fit parameters will be made and
saved as well.
"""
# Gets the downloaded months
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"])
# If the data directory doesn"t exist we should exit here.
if not os.path.exists(directory):
print("No data found.")
return
# Temporary arrays for moon phase plots
phasenum = 50
phasediv = 1 / phasenum # Phase ranges from 0-1 so divisions is 1/num divs
tphase = [[] for i in range(0, phasenum)]
# Temporary arrays for sunset plots
sunsetnum = 50
sunsetdiv = 0.5 / sunsetnum # Up to 12 hours after sunset = 0.5 day / divs
tsunset = [[] for i in range(0, sunsetnum)]
# Temporary arrays for sunrise plots
sunrisenum = 50
sunrisediv = 0.5 / sunrisenum # 12 hours before sunrise = 0.5 day / divs
tsunrise = [[] for i in range(0, sunrisenum)]
# Temp other stuff
normalnum = 50
normaldiv = 1. / normalnum
tnormal = [[] for i in range(0, normalnum)]
# Temporary arrays for week plots
tweek = [[] for i in range(0, 53)]
tmoon = [[] for i in range(0, 53)]
months = sorted(os.listdir(directory))
# Macs are dumb
if ".DS_Store" in months:
months.remove(".DS_Store")
for month in months:
# Gets the days that were analyzed for that month
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month])
days = sorted(os.listdir(directory))
# Strips out the year from the month
year = month[:4]
if not year in years:
continue
# Day 1 of the year, for week calculation.
yearstart = year + "0101"
day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000")
# Reads the data for each day.
for day in days:
loc = os.path.join(directory, day)
f1 = open(loc, "r")
# Strips off the .txt so we can make a Time object.
day = day[:-4]
for line in f1:
# Splits out the value and file.
line = line.rstrip()
line = line.split(",")
val = float(line[1])
name = line[0]
# A phantom image for calculation that doesn"t actually have
# any data in it.
img = image.AllSkyImage(name, day, "KPNO", None)
# Moon phase calculation.
phase = moon.moon_phase(img)
# Ignores cloudiness with moon phase less than 0.2
if phase < 0.2:
continue
b = int(phase // phasediv)
tphase[b].append(val)
formatdate = img.formatdate
# Sets the date of calculation.
camera.date = formatdate
date = ephem.Date(formatdate)
# Calculates the previous setting and next rising of the sun.
sun = ephem.Sun()
setting = camera.previous_setting(sun, use_center=True)
rising = camera.next_rising(sun, use_center=True)
# Finds the difference and bins it into the correct bin.
diff = date - setting
length = rising - setting
b = int(diff // sunsetdiv)
b2 = int((diff / length) // normaldiv)
tsunset[b].append(val)
tnormal[b2].append(val)
# Finds the difference and bins it into the correct bin.
diff = rising - date
b = int(diff // sunrisediv)
tsunrise[b].append(val)
# Week of the year calculation.
# Gets the date object for the image
date = img.time
# Finds the difference since the beginning of the year to find
# The week number.
diff = date - day1
week = int(diff.value // 7)
tweek[week].append(val)
tmoon[week].append(phase)
# Prints the progress if this is the final month of the year and we
# just finished it.
if month[4:] == "12":
print("Collated " + year + " data")
percents = ["25", "50", "75"]
colors = [(1, 0, 0, 1), (0, 0, 1, 1), (1, 1, 0, 1)]
# Nan array for if there"s no data for that bin.
nanarray = np.asarray([[float("nan"), float("nan"), float("nan")]])
# This method sets up the plots. using the given x array and the data set
# it finds the percentile values for each bin division.
def setup_plot(x, dataset):
# We only need this so the axis shape works out. We delete it later.
data = np.asarray([[0, 0, 0]])
for i, val in enumerate(dataset):
temp = np.asarray(val)
# Percentile returns an array of each of the three values.
# We"re creating a data array where each column is all the
# percentile values for each bin value.
if temp.size == 0:
data = np.append(data, nanarray, axis=0)
else:
d = np.reshape(np.percentile(temp, [25, 50, 75]), (1, 3))
data = np.append(data, d, axis=0)
# Deletes the first 0,0,0 array.
data = np.delete(data, 0, 0)
plt.ylim(0, 4.0)
# This sets up the individual plots. You can just pass the data
# multiararay but I need each line to be individually labeled.
for i, val in enumerate(percents):
plt.plot(x, data[0:data.shape[0], i], label=val + "%",
color=colors[i])
# This fills between the lowest and top percentiles.
plt.fill_between(x, data[0:data.shape[0], 0],
data[0:data.shape[0], -1], color=(1, 0.5, 0, 0.5))
plt.legend()
# Plotting and averaging code
# Moon phase
plt.ylabel("Cloudiness Relative to Mean")
plt.xlabel("Moon Phase")
x = np.asarray((range(0, phasenum)))
x = x * phasediv + (phasediv / 2.)
setup_plot(x, tphase)
plt.savefig("Images/Plots/phase.png", dpi=256, bbox_inches="tight")
plt.close()
# Sunset
plt.ylabel("Cloudiness Relative to Mean")
plt.xlabel("Hours since sunset")
x = np.asarray((range(0, sunsetnum)))
x = x * sunsetdiv * 24 + (sunsetdiv * 12)
setup_plot(x, tsunset)
plt.savefig("Images/Plots/sunset.png", dpi=256, bbox_inches="tight")
plt.close()
# Normalized
plt.ylabel("Cloudiness Relative to Mean")
plt.xlabel("Normalized time after sunset")
x = np.asarray((range(0, normalnum)))
x = x * normaldiv + (normaldiv / 2.)
setup_plot(x, tnormal)
plt.savefig("Images/Plots/normalized-0.png", dpi=256, bbox_inches="tight")
plt.close()
# Sunrise
x = np.asarray((range(0, sunrisenum)))
x = x * sunrisediv * 24 + (sunrisediv * 12)
# Sets up the plot before we plot the things
plt.ylabel("Cloudiness Relative to Mean")
plt.xlabel("Hours before sunrise")
setup_plot(x, tsunrise)
plt.savefig("Images/Plots/sunrise.png", dpi=256, bbox_inches="tight")
plt.close()
# Week
x = np.asarray((range(1, 54)))
#plt.ylabel("Cloudiness Relative to Mean")
plt.xlabel("Week Number")
plt.ylabel("Cloudiness Relative to Mean")
# Moon phase averages.
moon_avgs = []
num_imgs = []
for i, val in enumerate(tweek):
moon_avgs.append(np.mean(tmoon[i]))
num_imgs.append(len(val))
setup_plot(x, tweek)
#Reads in the csv file using pandas.
domedata = pd.read_csv("daily-2007-2017.csv")
closedict = dict.fromkeys(years)
for y in years:
name = "Y" + y
closedict[y] = domedata.get(name).values
tclose = [[] for i in range(0, 53)]
closeav = []
# i represents the day number of the year in 2016. If it is larger than 59
# (31 + 28) we need to increase the day by 1 to account for the leap day.
# This is because the domedata csv does not include February 29.
# For example, March 1st is considered day 60, when in 2016 it
# is actually day 61.
for j, val in enumerate(closedict["2016"]):
if j >= 60:
i = j + 1
else:
i = j
week1 = j // 7
week2 = i // 7
for key, val in closedict.items():
# We need to append to a different week if it's a leap year, due
# to the leap day changing the day number of the year after
# February.
week = week2 if int(key) % 4 == 0 else week1
tclose[week].append(val[j])
for i, val in enumerate(tclose):
closeav.append(np.mean(val))
# Alternative additional data.
#plt.plot(x, moons, label="Moon Phase", color=(0, 1, 0, 1))
#num_imgs = np.asarray(num_imgs)
#num_imgs = num_imgs * 1 / (np.amax(num_imgs))
#plt.plot(x, num_imgs, label="Normalized number of images",
#color=(0, 1, 0, 1))
plt.plot(x, closeav, label="Average Closed Fraction", color=(0,1,0,1))
plt.legend()
plt.savefig("Images/Plots/week.png", dpi=256, bbox_inches="tight")
plt.close()
if fit_histograms:
data = np.asarray([[0, 0, 0, 0, 0]])
split = 10
w = 0.61 / split
num = np.amax(tweek[2]) / w
divs = np.asarray(range(0, int(num) + 1))
divs = divs * w
for i, val in enumerate(tweek):
temp = np.asarray(val)
# Percentile returns an array of each of the three values.
# We"re creating a data array where each column is all the
# percentile values for each bin value.
if temp.size == 0:
data = np.append(data, nanarray, axis=0)
else:
# Passes the data to the fitting method.
temp = np.delete(temp, np.where(temp < 0.061))
d, _ = find_fit(temp, divs)
# Inserts a the sqaure root of the lambda for the mu-2 if the fit
# Is a gaussian-poisson combo.
if len(d) < 5:
d = np.asarray(d)
d = np.insert(d, 3, np.sqrt(d[2]))
d = d.reshape(1,5)
data = np.append(data, d, axis=0)
# Deletes the first 0,0,0 array.
data = np.delete(data, 0, 0)
# Code to plot the fits, abstracted because it was getting a little
# cumbersome to copy paste.
def fit_plot(x, data, name):
loc = "Images/Plots/week-" + name.lower() + ".png"
plt.plot(x, data, label=name)
plt.scatter(x, data, label=name, s=2, c="r")
plt.xlabel("Week Number")
plt.ylabel("Cloudiness Relative to Mean")
plt.legend()
plt.savefig(loc, dpi=256, bbox_inches="tight")
plt.close()
# This code plots the variables individually
sigma1 = data[0:data.shape[0], 0]
fit_plot(x, sigma1, "Sigma-1")
mu1 = data[0:data.shape[0], 1]
fit_plot(x, mu1, "Mu-1")
cv1 = sigma1 / mu1
fit_plot(x, cv1, "CV-1")
sigma2 = data[0:data.shape[0], 2]
fit_plot(x, sigma2, "Sigma-2")
mu2 = data[0:data.shape[0], 3]
print(np.argmin(mu2))
mu2 = np.where(mu2 < -100, mu1, mu2)
fit_plot(x, mu2, "Mu-2")
cv2 = sigma2 / mu2
fit_plot(x, cv2, "CV-2")
frac = data[0:data.shape[0], 4]
frac = (np.tanh(frac) + 1) / 2
fit_plot(x, frac, "Frac")
plt.plot(x, mu1 + mu2, label="Plus")
plt.scatter(x, mu1 + mu2, label="Plus", s=2, c="r")
plt.plot(x, np.abs(mu1 - mu2), label="Minus")
plt.scatter(x, np.abs(mu1 - mu2), label="Minus", s=2, c="g")
plt.xlabel("Week Number")
plt.legend()
plt.savefig("Images/Plots/week-mu-test.png", dpi=256, bbox_inches="tight")
plt.close()
plt.plot(x, sigma1 + sigma2, label="Plus")
plt.scatter(x, sigma1 + sigma2, label="Plus", s=2, c="r")
plt.plot(x, np.abs(sigma1 - sigma2), label="Minus")
plt.scatter(x, np.abs(sigma1 - sigma2), label="Minus", s=2, c="g")
plt.xlabel("Week Number")
plt.legend()
plt.savefig("Images/Plots/week-sigma-test.png", dpi=256, bbox_inches="tight")
plt.close()
plt.plot(x, cv1 + cv2, label="Plus")
plt.scatter(x, cv1 + cv2, label="Plus", s=2, c="r")
plt.plot(x, np.abs(cv1 - cv2), label="Minus")
plt.scatter(x, np.abs(cv1 - cv2), label="Minus", s=2, c="g")
plt.xlabel("Week Number")
plt.legend()
plt.savefig("Images/Plots/week-cv-test.png", dpi=256, bbox_inches="tight")
plt.close()
[docs]def model():
"""Model the dependence of cloudiness on the moon phase.
Notes
-----
Cloudiness is plotted on the y-axis against the moon phase on the x-axis,
and then np.linalg is used to find a quadratic model that fits this data.
This model is used to remove the dependence of moon phase on the
cloudiness data.
"""
loc = os.path.join(os.path.dirname(__file__), *["data", "phase.txt"])
data = []
# Reads in the phase and cloudiness information.
# Literal eval makes it so that it reads the list as a list rather than
# a string.
with open(loc, "r") as f:
for line in f:
line = line.rstrip()
b = ast.literal_eval(line)
data.append(b)
x = data[0]
x.pop(0)
x = np.asarray(x)
# Just need to convert to floats including the nan.
for i in range(1, len(data)):
data[i] = [float(j) for j in data[i]]
plt.ylim(0, 1.0)
plt.ylabel("Average Cloudiness Fraction")
plt.xlabel("Moon Phase")
coeffs1 = []
coeffs2 = []
for i in range(1, len(data)):
data[i].pop(0)
# This line is to avoid hard coding in case I add more years of data.
year = 2017 - len(data) + 1 + i
# We do the fitting manually using the least squares algorithm.
# This forces the fit to go through 0,0 since we do not pass a constant
# coefficient, only the x^2 and x terms.
A = np.vstack([x*x, x]).T
b, c = np.linalg.lstsq(A, data[i])[0]
coeffs1.append(b)
coeffs2.append(c)
plt.plot(x, data[i], label="Mean-" + str(year))
plt.plot(x, b*x*x + c*x, label="Fit-" + str(year))
# An average of the year wise fits.
m = np.mean(coeffs1)
n = np.mean(coeffs2)
plt.plot(x, m*x*x + n*x, label="Mean Fit")
# Writes the coefficients to a file for later use.
cloud_loc = os.path.join(os.path.dirname(__file__), *["data", "clouds.txt"])
with open(cloud_loc, "w") as f:
f.write(str(m) + "," + str(n))
plt.legend()
plt.savefig("Images/temp.png", dpi=256, bbox_inches="tight")
plt.close()
[docs]def histo():
"""Generate weekly histograms of cloudiness values for 2016 and 2017.
Notes
-----
The cloudiness values for every image saved using :func:`~analyze` are loaded and
collated into histograms containing one week each. Each week produces
three histograms: one with only 2016 images, one with only 2017
images, and one with both on the same histogram. The 159 histograms are
saved to "Images/Plots/Weeks".
"""
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"])
months = sorted(os.listdir(directory))
# Macs are dumb
if ".DS_Store" in months:
months.remove(".DS_Store")
tweek = {}
tweek["all"] = [[] for i in range(0, 53)]
for month in months:
# Gets the days that were analyzed for that month
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month])
days = sorted(os.listdir(directory))
# Strips out the year from the month
year = month[:4]
# Initializes a year if it doesn"t exist.
if not year in tweek:
tweek[year] = [[] for i in range(0, 53)]
# Day 1 of the year, for week calculation.
yearstart = year + "0101"
day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000")
# Reads the data for each day.
for day in days:
loc = directory + day
f1 = open(loc, "r")
# Strips off the .txt so we can make a Time object.
day = day[:-4]
for line in f1:
# Splits out the value and file.
line = line.rstrip()
line = line.split(",")
val = float(line[1])
name = line[0]
# Moon phase calculation.
phase = moon.moon_visible(day, name)
# Ignores cloudiness with moon phase less than 0.2
if phase < 0.2:
continue
date = coordinates.timestring_to_obj(day, name)
# Finds the difference since the beginning of the year to find
# The week number.
diff = date - day1
week = int(diff.value // 7)
tweek["all"][week].append(val)
tweek[year][week].append(val)
split = 10
w = 0.61 / split
# Starts by finding the divs because we want the width to be the same.
num = np.amax(tweek["all"][2]) / w
divs = np.asarray(range(0, int(num) + 1))
divs = divs * w
# Blocks out the non major tick labels. Only keeps the major ones
# Pre bar split (hence the modulo of the split)
labels = list(divs)
for j, label in enumerate(divs):
if j % split != 0:
labels[j] = ""
else:
labels[j] = round(label, 3)
binstop = -16 * split
histstop = binstop + 1
saveloc = "Images/Plots/Weeks"
if not os.path.exists(saveloc):
os.makedirs(saveloc)
# Don"t want to recreate this every time.
x = np.arange(0, divs[binstop], 0.05)
chosen = {1:0, 2:0, 3:0, 4:0}
sames = []
# Loops over each week (the i value)
for i, val in enumerate(tweek["all"]):
for year, value in tweek.items():
temp = np.asarray(tweek[year][i])
temp = np.delete(temp, np.where(temp < 0.061))
hist, bins = np.histogram(temp, bins=divs)
# Sets the size wider than the previous to fit all the bins.
# I shave off a lot of 0 value bins later as well in plotting.
fig = plt.figure()
fig.set_size_inches(11.4, 8.4)
print("Week " + str(i+1))
plt.title("Week " + str(i+1))
plt.ylim(0, 900 / (split / 2))
plt.ylabel("Number of Occurrences")
plt.xlabel("Cloudiness Relative to Mean")
# Number of images used to make this histogram.
num = len(temp)
# Plots everything, histogram, and then the fitted data on top.
if year == "all":
# Changes the numbers for each year
temp2 = np.asarray(tweek["2016"][i])
temp2 = np.delete(temp2, np.where(temp2 < 0.061))
num2 = len(temp2)
num = num - num2
# For all, we plot everything, then 2016 on top for separation.
plot = plt.bar(bins[:binstop], hist[:histstop], width=w,
align="edge", tick_label=labels[:binstop],
label="2017 (" + str(num) + ")")
hist2, bins2 = np.histogram(temp2, bins=divs)
plot = plt.bar(bins2[:binstop], hist2[:histstop], width=w,
align="edge", tick_label=labels[:binstop],
label="2016 (" + str(num2) + ")")
else:
# Plots just the year histogram.
plot = plt.bar(bins[:binstop], hist[:histstop], width=w,
align="edge", tick_label=labels[:binstop],
label=year + " (" + str(num) + ")")
coeffs, best = find_fit(temp, divs)
print("Fit " + str(best + 1) + " Chosen")
# Increments the dictionary counter
chosen[best+1] = chosen[best+1] + 1
print(str(year) + ": " + str(coeffs))
# Finds the y func, then scales so it has the same area as the hist
if len(coeffs) == 4:
y = function_gp(coeffs, x)
else:
y = function_gg(coeffs, x)
#print("Area: " + str(np.trapz(y,x)))
scale = np.sum(hist) * w / np.trapz(y,x)
y = y * scale
# Plots the fit.
plt.plot(x, y, color=(0, 1, 0, 1), label="Fit-" + str(best + 1))
plt.legend()
plt.savefig("Images/Plots/Weeks/hist-" + str(i + 1) + "-" + year + ".png",
dpi=256, bbox_inches="tight")
plt.close()
print("Saved: Week " + str(i+1))
print()
print(chosen)
nums = 0
for i, val in enumerate(sames):
if val:
nums = nums + 1
print("Trues: " + str(nums))
print("Falses: " + str(len(sames) - nums))
# Divs is the divisions to use, used in finding the guess parameters
[docs]def find_fit(data, divs):
"""Find the best fit to a weekly histogram.
Parameters
----------
data : array_like
The cloudiness data used in generating a histogram.
divs : array_like
The lower bounds on the divisions used in generating a histogram.
Returns
-------
tuple
Tuple, where the first item is a list of coefficients corresponding
to the best fit, and the second item is an identifying number that
corresponds to which fit (a double gaussian or a Gaussian and Poisson
hybrid) is the best. 0 and 1 correspond to a double Gaussian, and 2 and
3 correspond to a Gaussian and Poisson hybrid.
"""
split = 10
w = 0.61 / split
# Assign a new array so we don"t edit the original
temp = np.copy(data)
hist, _ = np.histogram(temp, bins=divs)
# The guess for the first mean is the maximum that occurs before the cutoff
# at index 35 (about 2.44)
# And the guess for the second mean is the maximum that occurs after.
index = np.argmax(hist[:32])
guess = (index + 0.5) * w
guess2 = (np.argmax(hist[32:]) + 32.5) * w
# This block of code finds the first time the histogram falls below
# Half the max, which gives us the half width at half maximum.
# (Aprroximately). Since the curve is not smooth this actually
# Isn"t great. Typically the histogram will drop below the half
# Max before jumping up above it again for a few bins.
less = np.where(hist < (hist[index] / 2), True, False)
hwhm = (np.argmax(less[index:])) * w
guesssigma = hwhm / (np.sqrt(2 * np.log(2)))
# Guesses the frac paramter by comparing the heights of the two maximums.
maximum = np.amax(hist)
guessfrac = maximum/(maximum + 1 * np.amax(hist[35:]))
guessfrac = np.arctanh(2 * guessfrac - 1)
# Sets guessfrac to 0 if it breaks for some reason.
if np.isnan(guessfrac) or np.isinf(guessfrac):
guessfrac = 0
fitarr = []
coeffsarr = []
fit1 = fit_function(temp)
fitarr.append(fit1.fun)
coeffsarr.append(fit1.x)
fit1 = fit_function(temp, init=[0.1, guess, 0.5, guess2, guessfrac])
fitarr.append(fit1.fun)
coeffsarr.append(fit1.x)
fit1 = fit_function(temp, func="gp")
fitarr.append(fit1.fun)
coeffsarr.append(fit1.x)
fit1 = fit_function(temp, func="gp", init=[0.1, guess, guess2, guessfrac])
fitarr.append(fit1.fun)
coeffsarr.append(fit1.x)
fitarr = np.abs(np.asarray(fitarr))
best = np.argmin(fitarr)
orig = best
coeffs = np.copy(coeffsarr)
while np.abs(coeffsarr[best][0]) < 0.002:
print("Delta Found")
fitarr = np.delete(fitarr, best)
coeffsarr.pop(best)
if fitarr.size == 0:
best = np.argmin(fitarr)
else:
best = orig
coeffsarr = np.copy(coeffs)
coeffsarr[best][0] = 0.05
break
return (coeffsarr[best], orig)
# Inverted the args for this, so they match those used by scipy"s minmize.
# Minimize changes the coefficients making those the variables here.
[docs]def function_gp(params, x):
"""A Gaussian and Poisson hybrid function.
Parameters
----------
params : array_like
A list containing all of the defining parameters for the function.
The first item is the standard deviation of the Gaussian, the second is
the mean of the Gaussian, the third is the mean and variance
of the Poisson, and the fourth is a fraction that defines how biased
the function should be towards the Gaussian. For example, if the
provided fraction is 1 then the function is only a Gaussian,
whereas if the provided fraction is 0 the function is only the a
Poisson.
x : array_like
The x coordinates to plug into the function.
Returns
-------
numpy.ndarray
The function values corresponding to the input x values.
Notes
-----
The function is normalized so that the area under the curve from 0 to
infinity is equal to 1.
"""
sigma = params[0]
mu = params[1]
lamb = params[2]
frac = params[3]
# A is the normalization constant, here changed because 0 is a hard cutoff.
# So we normalize 0 to infinity rather than -infinity to infinity.
# Trapz is the integration method using the trapezoid rule.
x1 = np.arange(0, 400, 0.1)
A = 1 / np.trapz(np.exp(-((x1 - mu) ** 2) / (2 * sigma * sigma)), x1)
# This constrains frac to be between 0 and 1 using the hyperbolic tangent
frac = (np.tanh(frac) + 1) / 2
p1 = frac * A * np.exp(-((x - mu) ** 2) / (2 * sigma * sigma))
p2 = (1 - frac) * (lamb ** x / special.factorial(x)) * np.exp(-lamb)
return p1 + p2
[docs]def function_gg(params, x):
"""A double Gaussian hybrid function.
Parameters
----------
params : array_like
A list containing all of the defining parameters for the function.
The first item is the standard deviation of the first (lower) Gaussian,
the second is the mean of the first Gaussian, the third is the standard
deviation of the second (higher) Gaussian, the fourth is the mean of
the second Gaussian, and the fifth is a fraction that defines how
biased the function should be towards the first Gaussian.
For example, if the provided fraction is 1 then the function is only
the first Gaussian, whereas if the provided fraction is 0 the function
is only the second Gaussian.
x : array_like
The x coordinates to plug into the function.
Returns
-------
numpy.ndarray
The function values corresponding to the input x values.
Notes
-----
The function is normalized so that the area under the curve from 0 to
infinity is equal to 1.
"""
sigma1 = params[0]
mu1 = params[1]
sigma2 = params[2]
mu2 = params[3]
frac = params[4]
# A is the normalization constant, here changed because 0 is a hard cutoff.
# So we normalize 0 to infinity rather than -infinity to infinity.
# Trapz is the integration method using the trapezoid rule.
x1 = np.arange(0, 400, 0.1)
A = 1 / np.trapz(np.exp(-((x1 - mu1) ** 2) / (2 * sigma1 * sigma1)), x1)
B = 1 / np.trapz(np.exp(-((x1 - mu2) ** 2) / (2 * sigma2 * sigma2)), x1)
# This constrains frac to be between 0 and 1 using the hyperbolic tangent
frac = (np.tanh(frac) + 1) / 2
p1 = frac * A * np.exp(-((x - mu1) ** 2) / (2 * sigma1 * sigma1))
p2 = (1 - frac) * B * np.exp(-((x - mu2) ** 2) / (2 * sigma2 * sigma2))
return p1 + p2
def likelihood_gg(params, data):
"""The log likelihood value corresponding to the double Gaussian hybrid
function.
Parameters
----------
params : array_like
A list containing all of the defining parameters for the function.
The first item is the standard deviation of the first (lower) Gaussian,
the second is the mean of the first Gaussian, the third is the standard
deviation of the second (higher) Gaussian, the fourth is the mean of
the second Gaussian, and the fifth is a fraction that defines how
biased the function should be towards the first Gaussian.
For example, if frac = 1 then the function is only the first Gaussian,
whereas if frac = 0 the function is only the second Gaussian.
data : array_like
The x coordinates to plug into the function.
Returns
-------
float
The log likelihood value.
Notes
-----
The log likelihood of a function is calulated using a chi-squared analysis,
and thus is the negative sum of the natural logarithm of the function
values.
"""
# In chi-squared there"s a 2 in front but since we"re minimizing I"ve
# dropped it.
chi = -np.sum(np.log(function_gg(params, data)))
return chi
def likelihood_gp(params, data):
"""The log likelihood value corresponding to the Gaussian and Poisson hybrid
function.
params : array_like
A list containing all of the defining parameters for the function.
The first item is the standard deviation of the Gaussian, the second is
the mean of the Gaussian, the third is the mean and standard deviation
of the Poisson, and the fourth is a fraction that defines how biased
the function should be towards the Gaussian. For example, if frac = 1
then the function is only a Gaussian, whereas if frac = 0 the function
is only a Poisson.
data : array_like
The x coordinates to plug into the function.
Returns
-------
float
The log likelihood value.
Notes
-----
The log likelihood of a function is calulated using a chi-squared analysis,
and thus is the negative sum of the natural logarithm of the function
values.
"""
# In chi-squared there"s a 2 in front but since we"re minimizing I"ve
# dropped it.
chi = -np.sum(np.log(function_gp(params, data)))
return chi
# Fits the model to the data
# I abstracted this in case I need it somewhere else.
[docs]def fit_function(xdata, func="gg", init=None):
"""Fit a function to a histogram.
Parameters
----------
xdata : array_like
The cloudiness data used to compute a histogram.
func : str, optional
Which function to fit. "gg" corresponds to a double Gaussian hybrid,
and "gp" corresponds to a Gaussian and Poisson hybrid function.
Defaults to "gg".
init : list, optional
A list of initial guesses for the function fitting. Defaults to
None. If None, uses an initial guess of [0.1,0.5,0.5,3,0] for
a double Gaussian hybrid function, and [0.1,0.5,3,0] for a Gaussian
and Poisson hybrid function.
Returns
-------
scipy.optimize.OptimizeResult
An object corresponding to the fit function.
"""
if func == "gg":
likelihood = likelihood_gg
if init is None:
init = [0.1,0.5,0.5,3,0]
else:
likelihood = likelihood_gp
if init is None:
init = [0.1,0.5,3,0]
# Default for maxiter is N * 200 but that"s not enough in this case so we
# need to specify a higher value
fit = optimize.minimize(likelihood, x0=init, args=xdata,
method="Nelder-Mead",
options={"disp":False, "maxiter":1200})
return fit
# Saves the cloudiness data as a csv file.
[docs]def to_csv():
"""Convert the saved daily cloudiness data to a CSV file.
Notes
-----
The cloudiness values for every image saved using :func:`~analyze` are loaded and
converted to a single CSV file: data.csv.
The CSV file has 5 headers: Date & Time, Year, Week Number, Normalized Time
after Sunset and Cloudiness Relative to the Mean.
"""
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed"])
months = sorted(os.listdir(directory))
# Macs are dumb
if ".DS_Store" in months:
months.remove(".DS_Store")
# Temp other stuff
normalnum = 50
normaldiv = 1. / normalnum
data = []
for month in months:
# Gets the days that were analyzed for that month
directory = os.path.join(os.path.dirname(__file__), *["data", "analyzed", month])
days = sorted(os.listdir(directory))
# Strips out the year from the month
year = month[:4]
# Day 1 of the year, for week calculation.
yearstart = year + "0101"
day1 = coordinates.timestring_to_obj(yearstart, "r_ut000000s00000")
# Reads the data for each day.
for day in days:
loc = os.path.join(directory, day)
f1 = open(loc, "r")
# Strips off the .txt so we can make a Time object.
day = day[:-4]
for line in f1:
linedata = []
# Splits out the value and file.
line = line.rstrip()
line = line.split(",")
val = float(line[1])
name = line[0]
# Moon phase calculation.
phase = moon.moon_visible(day, name)
# Ignores cloudiness with moon phase less than 0.2
if phase < 0.2:
continue
date = coordinates.timestring_to_obj(day, name)
# Finds the difference since the beginning of the year to find
# The week number.
diff = date - day1
week = int(diff.value // 7)
# Sunset time calculation.
# 12 hours after sunset for 50 bins = 0.01 of a day per bin.
formatdate = day[:4] + "/" + day[4:6] + "/" + day[6:]
time = name[4:6] + ":" + name[6:8] + ":" + name[8:10]
formatdate = formatdate + " " + time
linedata.append(formatdate)
linedata.append(year)
linedata.append(week)
# Sets the date of calculation.
camera.date = formatdate
date = ephem.Date(formatdate)
# Calculates the previous setting and next rising of the sun.
sun = ephem.Sun()
setting = camera.previous_setting(sun, use_center=True)
rising = camera.next_rising(sun, use_center=True)
# Finds the difference and bins it into the correct bin.
diff = date - setting
length = rising - setting
normalize = int((diff / length) // normaldiv)
linedata.append(normalize)
linedata.append(val)
data.append(np.asarray(linedata))
data = np.asarray(data)
d2 = pd.DataFrame(data, columns=["Date & Time", "Year", "Week Number",
"Normalized Time after Sunset",
"Cloudiness Relative to the Mean"])
d2.to_csv("data.csv")
if __name__ == "__main__":
#analyze()
plot(["2014", "2015", "2016", "2017"])