#!/usr/bin/env python

# create uberdata file, where every data point is on a row by itself,
# the way SPSS wants it to be

# each row of the output file has the following tab-separated fields, in this order:
# * uberdata file
#   * participant #
#   * case
#   * phase
#   * gesture name
#   * gesture index in its set
#   * delta time
#   * correct
#   - if incorrect:
#     * actual G name
#     * actual G index
#     - if actual G is real (i.e., not BLANK, etc.)
#       * rotation
#       * horiz flipped
#       * vert flipped
#       * backwards

import re, sys, string, os

# name this script was invoked as, used to build the usage message
progName = os.path.basename(sys.argv[0])
USAGE = '%s: usage: %s numbering cmFile1 [cmFile2 ...]\n' % (progName, progName)

# at least two arguments are required: the numbering file and one cm file
if len(sys.argv) < 3:
    sys.stderr.write(USAGE)
    sys.exit(-1)

# first argument is the numbering file, every remaining argument is a cm file
numberingFile = sys.argv[1]
files = sys.argv[2:]

def readNumbering(fileName):
    """returns a list of dictionaries, one for each case, keyed by gesture name, whose values are the indices

    The first line of the file is a tab-separated list of gesture names
    (surrounding whitespace is stripped from the line and from each name).
    Every following line yields one dictionary: its first tab-separated
    field is skipped and the remaining fields are parsed as integers and
    paired, by position, with the gesture names.

    Raises IOError if the file cannot be opened, ValueError if an index is
    not an integer, and IndexError if a line holds more indices than there
    are gesture names.
    """
    # 'with' guarantees the file is closed, even when parsing fails
    with open(fileName) as infile:
        header = infile.readline()
        rows = infile.readlines()

    gNames = [name.strip() for name in header.strip().split('\t')]
    result = []
    for row in rows:
        # [1:] drops the leading field of the row; it is not an index
        indices = [int(field) for field in row.strip().split('\t')[1:]]
        caseDict = {}
        for i, index in enumerate(indices):
            caseDict[gNames[i]] = index
        result.append(caseDict)
    return result

def appendToDict(dict, key, value):
    "appends value to the list stored under key in dict, creating that list first if key is not present"
    # setdefault replaces the has_key() test, which is deprecated in
    # Python 2 and does not exist in Python 3
    dict.setdefault(key, []).append(value)
        
# Pattern for data file names: an optional directory prefix followed by
# <pid>-<phase>_<month>-<day><rest>, where phase is a single digit.
# Written as a raw string so the \d escapes reach the regex engine untouched.
DATA_FILENAME_REGEX = re.compile(r"(?P<prefix>.*/)?(?P<pid>\d+)-(?P<phase>\d)_(?P<month>\d+)-(?P<day>\d+)(?P<rest>.*)")
def groupByPid(files):
    """returns a dictionary keyed by pid (as string) whose values are lists of file names for that pid

    Within each list the file names keep the order they have in files.
    Raises ValueError if a file name does not match DATA_FILENAME_REGEX.
    """
    result = {}
    for fileName in files:
        match = DATA_FILENAME_REGEX.match(fileName)
        if match is None:
            # fail with the offending name instead of an AttributeError on None
            raise ValueError('unrecognized data file name: %r' % (fileName,))
        result.setdefault(match.group('pid'), []).append(fileName)
    return result

def getDate(fileName):
    "returns the 'month-day' string taken from the month and day parts of a data file name"
    parsed = DATA_FILENAME_REGEX.match(fileName)
    month, day = parsed.group('month', 'day')
    return '%s-%s' % (month, day)

def computePhase(files):
    """return dict keyed by filename whose values are phases

    Files are grouped by participant id. For each participant, the files
    carrying that participant's earliest (month, day) get the phase digit
    from their name minus 1; every other file gets 6 minus that digit.

    Month and day are compared as integers, so names without zero padding
    (e.g. 9-30 versus 10-5) are ordered correctly; comparing them as
    'month-day' strings would not be.
    NOTE(review): file names carry no year, so sessions that straddle a
    year boundary cannot be ordered correctly -- confirm this never occurs.
    """
    def dateKey(fileName):
        # numeric (month, day) pair parsed from the file name
        month, day = DATA_FILENAME_REGEX.match(fileName).group('month', 'day')
        return (int(month), int(day))

    phaseDict = {}
    for pidFiles in groupByPid(files).values():
        # only the earliest date matters, so no full sort is needed
        firstDate = min(dateKey(fileName) for fileName in pidFiles)
        for fileName in pidFiles:
            pseudophase = int(DATA_FILENAME_REGEX.match(fileName).group('phase'))
            if dateKey(fileName) == firstDate:
                phaseDict[fileName] = pseudophase - 1
            else:
                phaseDict[fileName] = 6 - pseudophase
    return phaseDict

# list of per-case gesture-name -> index dictionaries read from the numbering file
numbering = readNumbering(numberingFile)

# phase of every cm file given on the command line, keyed by file name
phaseDict = computePhase(files)

def pidToCase(pidStr):
    """returns the case number for a participant id given as a string

    Ids below 100 cycle through cases 1, 2, 3 (id 1 -> 1, id 2 -> 2,
    id 3 -> 3, id 4 -> 1, ...); ids of 100 and above alternate between
    cases 1 and 2 (odd ids -> 1, even ids -> 2).

    Raises ValueError if pidStr is not a decimal integer.
    """
    # int() replaces string.atoi(), which is deprecated and absent from Python 3
    pid = int(pidStr)
    if pid < 100:
        return (pid - 1) % 3 + 1
    return (pid - 1) % 2 + 1

# Write one tab-separated output row per line of every cm file.
for fileName in files:
    phase = phaseDict[fileName]
    pid = DATA_FILENAME_REGEX.match(fileName).group('pid')
    case = pidToCase(pid)
    # 'with' closes each data file as soon as it has been processed
    with open(fileName) as dataFile:
        for line in dataFile:
            fields = line.strip().split('\t')
            (desiredName, timeSinceStart, correct) = fields[0:3]
            # numbering holds one dictionary per case; cases are numbered from 1
            desiredIndex = numbering[case-1][desiredName]
            row = [pid, '%s' % case, '%d' % phase, desiredName,
                   '%d' % desiredIndex, timeSinceStart, correct]
            if correct == '0':
                # the input has the actual index before the actual name;
                # the output has the name first
                (actualIndex, actualName) = fields[3:5]
                row.extend([actualName, actualIndex])
                # only a non-negative actual index is followed by the four
                # transformation fields
                if int(actualIndex) >= 0:
                    (rotation, horizFlip, vertFlip, backwards) = fields[5:]
                    row.extend([rotation, horizFlip, vertFlip, backwards])
            # Emit the whole row in a single write: chaining "print x,"
            # statements made Python 2 insert a soft space before each
            # continuation, leaving stray " \t" sequences in the output.
            sys.stdout.write('\t'.join(row) + '\n')

