require 'torch'
require 'optim'  -- numeric optimization routines (SGD, Adagrad, ...)
require 'nn'     -- neural-network modules and criteria

-- Load the pre-serialized training and test tensors.
local train_file_path = 'train.th7'
local test_file_path = 'test.th7'
local train_data = torch.load(train_file_path)
local test_data = torch.load(test_file_path)

-- Columns 2-5 hold the four regression targets; columns 6 to the end hold the features.
local train_labels = train_data[{ {}, {2, 5} }]
local train_X = train_data[{ {}, {6, -1} }]
local test_labels = test_data[{ {}, {2, 5} }]
local test_X = test_data[{ {}, {6, -1} }]

local batch_size = 30  -- not used below: feval processes one sample per step

-- Define a fully connected network: 350 inputs -> 1024 -> 512 -> 256 -> 4 outputs,
-- with sigmoid activations between the linear layers.
model = nn.Sequential()
ninputs = 350; noutputs = 4; nhiddens1 = 1024; nhiddens2 = 512; nhiddens3 = 256
model:add(nn.Linear(ninputs, nhiddens1))
model:add(nn.Sigmoid())
model:add(nn.Linear(nhiddens1, nhiddens2))
model:add(nn.Sigmoid())
model:add(nn.Linear(nhiddens2, nhiddens3))
model:add(nn.Sigmoid())
model:add(nn.Linear(nhiddens3, noutputs))

-- L1 loss over the four outputs (nn.MSECriterion() would give a squared-error loss instead).
criterion = nn.AbsCriterion()

-- Flatten the model's parameters and their gradients into two vectors for optim.
x, dl_dx = model:getParameters()

-- Closure used by the optimizer: given a parameter vector x_new, it returns the
-- loss on one training sample and the gradient of that loss w.r.t. the parameters.
feval = function(x_new)
   if x ~= x_new then
      x:copy(x_new)
   end

   -- select the next training sample, cycling through the dataset
   _nidx_ = (_nidx_ or 0) + 1
   if _nidx_ > (#train_data)[1] then _nidx_ = 1 end

   local target = train_labels[_nidx_]  -- this indexing syntax selects one row
   local inputs = train_X[_nidx_]

   -- reset gradients (gradients are always accumulated, to accommodate batch methods)
   dl_dx:zero()

   -- sanity check: every feature value should be a plain number
   for j = 1, ninputs do
      if type(inputs[j]) ~= 'number' then
         print(j, inputs[j], type(inputs[j]))
      end
   end

   -- evaluate the loss and its derivative w.r.t. x for this sample
   local loss_x = criterion:forward(model:forward(inputs), target)
   model:backward(inputs, criterion:backward(model.output, target))

   -- return loss(x) and dloss/dx
   return loss_x, dl_dx
end

-- Given the closure above, we can train the model with a stochastic optimizer.
-- The parameter table controls the optimization:
--  + learningRate: size of the step taken at each stochastic estimate of the gradient
--  + weightDecay: L2 regularization of the solution
--  + momentum: averages steps over time (not used by Adagrad)
--  + learningRateDecay: lets the algorithm converge more precisely
sgd_params = {
   learningRate = 0.01,
   learningRateDecay = 1e-08,
   weightDecay = 0,
   momentum = 0
}

-- All that is left is to cycle over the training data and perform one stochastic
-- update per sample. A single epoch is run here; the number of epochs should
-- typically be determined by cross-validation.
for epoch = 1, 1 do
   print(epoch)

   -- this variable accumulates the loss so we can report an epoch average
   current_loss = 0

   -- an epoch is a full loop over the training data
   for i = 1, (#train_data)[1] do
      -- optim contains several optimization algorithms. All of them expect:
      --  + a closure that computes the loss and its gradient w.r.t. x, given a point x
      --  + a point x
      --  + a table of algorithm-specific parameters
      _, fs = optim.adagrad(feval, x, sgd_params)
      -- optim functions return the new x and the list of loss values evaluated
      -- during the step; Adagrad evaluates the closure once, so fs holds one value.
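      -- Roughly, Adagrad rescales each parameter's step by the history of its own
      -- gradients: x := x - lr * g / sqrt(sum of past g^2), so parameters that have
      -- seen large gradients take smaller steps over time (a sketch of the update
      -- rule, not the exact optim implementation).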
      current_loss = current_loss + fs[1]
   end

   -- report the average error over the epoch
   current_loss = current_loss / (#train_data)[1]
   print('train loss = ' .. current_loss)
end

----------------------------------------------------------------------
-- 5. Test the trained model.
-- Now that the model is trained, evaluate it on the held-out test set by
-- reporting the mean absolute error of each of the four outputs.
local loss1 = 0.0
local loss2 = 0.0
local loss3 = 0.0
local loss4 = 0.0
for i = 1, (#test_data)[1] do
   local myPrediction = model:forward(test_X[i])
   loss1 = loss1 + math.abs(myPrediction[1] - test_labels[i][1])
   loss2 = loss2 + math.abs(myPrediction[2] - test_labels[i][2])
   loss3 = loss3 + math.abs(myPrediction[3] - test_labels[i][3])
   loss4 = loss4 + math.abs(myPrediction[4] - test_labels[i][4])
end
loss1 = loss1 / (#test_data)[1]
loss2 = loss2 / (#test_data)[1]
loss3 = loss3 / (#test_data)[1]
loss4 = loss4 / (#test_data)[1]
print(loss1, loss2, loss3, loss4)

-- serialize the trained model to disk
torch.save('save.dat', model)
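-- A minimal sketch (commented out) of how the serialized model could be reloaded
-- and used for inference in a separate script. It assumes the same file layout as
-- above ('save.dat' for the model, 'test.th7' with features in columns 6 to the end).
--
--   local trained = torch.load('save.dat')
--   local new_data = torch.load('test.th7')
--   local sample = new_data[{ 1, {6, -1} }]   -- feature vector of the first row
--   print(trained:forward(sample))            -- prints the 4 predicted values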